diff --git a/.gitignore b/.gitignore index 5160f778..580d522e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,9 @@ repeats.txt build/ xcheck.sh *.fa.fai +*.o +/bin-meson +/build-meson +*.ninja +.ninja* +test-mount.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index d561ee13..2c939c8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(fastafs) # Do this once in a while - find different bugs #set(CMAKE_CXX_COMPILER "clang++") -set(PROJECT_VERSION "1.6.2") +set(PROJECT_VERSION "1.7.0") set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs") set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues") @@ -41,7 +41,11 @@ else() set(DEBUG "false") endif() -configure_file("include/config.hpp.in" "include/config.hpp") +configure_file("include/config.hpp.in" "include/config.hpp")# implies building is done from project root +#configure_file("include/config.hpp.in" "${CMAKE_CURRENT_BINARY_DIR}/config.hpp") +#configure_file("include/config.hpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/config.hpp") +configure_file("include/config.hpp.in" "${BUILD_DIR}/include/config.hpp") + # ---------------------------------------------------------------------- # ------------------------------ Styling ------------------------------- @@ -75,7 +79,9 @@ add_custom_target(tidy DEPENDS make_tidy ) add_subdirectory(src) include_directories(include) -add_definitions(-std=c++17) +#include_directories(${BUILD_DIR}) +include_directories("${BUILD_DIR}/include") +add_definitions(-std=c++14) # Boost find_package(Boost COMPONENTS unit_test_framework REQUIRED) @@ -87,9 +93,11 @@ else() include_directories(${Boost_INCLUDE_DIRS}) endif() + link_libraries(ssl) link_libraries(crypto) link_libraries(fuse) +link_libraries(z)# zlib; -lz; for crc32 checks on whole file integrity if(DEBUG) @@ -100,13 +108,17 @@ endif() add_executable(fastafs src/main.cpp - src/fasta_to_fastafs.cpp + src/fasta_to_twobit_fastafs.cpp + src/fasta_to_fourbit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp + src/flags.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp + src/fourbit_byte.cpp src/database.cpp src/utils.cpp + src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp ) @@ -115,24 +127,29 @@ set_target_properties(fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}" # mount-only binary, without all the other stuff 'mount.fastafs' [for fstab] add_executable(mount.fastafs src/main_mount.cpp - src/fasta_to_fastafs.cpp + src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp + src/flags.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp + src/fourbit_byte.cpp src/database.cpp src/utils.cpp + src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp ) set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}") add_library(libfastafs SHARED - src/fasta_to_fastafs.cpp + src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp + src/flags.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp + src/fourbit_byte.cpp src/database.cpp src/utils.cpp src/fuse.cpp @@ -146,11 +163,11 @@ set_target_properties(libfastafs PROPERTIES VERSION ${PROJECT_VERSION}) set_target_properties(libfastafs PROPERTIES SOVERSION 1) set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs) -#set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include") -# great, this doesn't go automagically with an entire dir -set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") -#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include) -#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include") +##set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include") +## great, this doesn't go automagically with an entire dir +set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fourbit_fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/flags.hpp;include/fourbit_byte.hpp;include/fuse.hpp;include/lsfastafs.hpp;include/sequence_region.hpp;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") +##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include) +##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include") # ---------------------------------------------------------------------- # ------------------------------ Testing ------------------------------- @@ -161,16 +178,20 @@ enable_testing() add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as alias for 'make test' -add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") -add_test(test_cache "${BUILD_TEST_DIR}/test_cache") +add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N) +add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) +add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit") +add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_fourbit") add_test(test_view "${BUILD_TEST_DIR}/test_view") -#add_test(test_tree "${BUILD_TEST_DIR}/test_tree") +add_test(test_flags "${BUILD_TEST_DIR}/test_flags") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") +add_test(test_check "${BUILD_TEST_DIR}/test_check") # file integrity checks add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs") add_test(test_ucsc2bit_as_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_as_fasta") +add_test(test_sequenceregion "${BUILD_TEST_DIR}/test_sequenceregion") add_test(test_utils "${BUILD_TEST_DIR}/test_utils") - +#add_test(test_tree "${BUILD_TEST_DIR}/test_tree") #find_program(CTEST_MEMORYCHECK_COMMAND NAMES valgrind) # 'ctest -T memcheck' #INCLUDE(Dart) @@ -181,6 +202,8 @@ add_test(test_utils "${BUILD_TEST_DIR}/test_utils") # The compiled binary, usually to: /usr/local/bin/fastafs install(TARGETS fastafs DESTINATION "bin") install(TARGETS mount.fastafs DESTINATION "bin") + +# don't build during debug at least install(TARGETS libfastafs LIBRARY DESTINATION "lib" PUBLIC_HEADER DESTINATION "include/libfastafs") # ---------------------------------------------------------------------- diff --git a/Changelog b/Changelog index 7d6166b6..59257bf3 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,23 @@ +2020-01-27 Youri Hoogstrate + + * v1.7.0 + * `fastafs cache -o` for custom output files and bypassing the config + * Random access subsequence retrieval diretly via filesystem: `/seq/chr1:100-200` + * Implements CRC32 checksums for whole-file integritity + * Converting to meson because of insane build times using cmake+make and re-building files that have not changed + * `fastafs view|mount -m/--no-masking` virtualises fasta files without masking (uppercase) + * Minor support for building with meson and ninja + * cmake template allows building for guix (+guix file provided) + * Changed requirement from c++17 on c++14 to avoid large compatibility issues + * Implements bitflags with corresponding class + * Implements fourbit (and automatically switches over if non ACTGUN chars are found + * Implements functions `is_fasta_file`, and `is_ucsc2bit_file` using file MAGIC + * Creates by FASTAFS files that are first flagged as incomplete, that are unflagged after conversion has completed + * MD5sums working for fourbit compressed sequences + * Implements `fastafs cache -o` to export to desired output fastafs file + * Adds compression type to `fastafs list` output + * More and improved testing, including file integrity detection + 2019-09-06 Youri Hoogstrate * v1.6.2 diff --git a/README.md b/README.md index 5cff4e4b..0845ed9e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# fastafs: fuse layer and file system for storing FASTA files +# FASTAFS: toolkit for file system virtualisation of random access compressed FASTA files ---- @@ -27,7 +27,7 @@ Required dependencies are: - libboost (only for unit testing, will be come an optional dependency soon) - libopenssl (for generating MD5 hashes) - libfuse (for access to the fuse layer system and file virtualization) - - c++ compiler supporting c++-17 + - c++ compiler supporting c++-14 Compilation is done using cmake. The build command to run cmake for common use is: diff --git a/bin/.gitignore b/bin/.gitignore deleted file mode 100644 index 9f8bdef4..00000000 --- a/bin/.gitignore +++ /dev/null @@ -1 +0,0 @@ -fastafs diff --git a/build-debug.sh b/build-debug.sh index 6eb62a47..eec58e92 100755 --- a/build-debug.sh +++ b/build-debug.sh @@ -1,5 +1,11 @@ #!/bin/bash +#cmake -GNinja -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +#ninja -j`nproc` +#ninja install + + +## using make - sometimes much slower cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -make "$@" -j `nproc` +make "$@" -j $(nproc) make install diff --git a/build-release-meson.sh b/build-release-meson.sh new file mode 100755 index 00000000..b76c7eb2 --- /dev/null +++ b/build-release-meson.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +meson bin-meson +cd bin-meson +ninja diff --git a/build-release.sh b/build-release.sh index 58278282..5f240a76 100755 --- a/build-release.sh +++ b/build-release.sh @@ -1,5 +1,7 @@ #!/bin/bash +#cmake -GNinja -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . + cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -make "$@" -j `nproc` -sudo make install +make "$@" -j $(nproc) +make install diff --git a/doc/4bit.txt b/doc/4bit.txt new file mode 100644 index 00000000..8e24e65d --- /dev/null +++ b/doc/4bit.txt @@ -0,0 +1,18 @@ +A 0000 +B 0001 +C 0010 +D 0011 +G 0100 +H 0101 +K 0110 +M 0111 +N 1000 +R 1001 +S 1010 +T 1011 +U 1100 +V 1101 +W 1110 +Y 1111 + +- by idx diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index 0a1580e2..cf1ce0d8 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -21,7 +21,7 @@ If this metadata would be written in the header located before the sequence data | GENERIC-HEADER | | | | | | [MAGIC](#magic) | 4 bytes | `x0F x0A x46 x53` | | | [FILE FORMAT VERSION](#file-format-version) | [4-byte integer](#four-byte-integer) | `x00 x00 x00 x00` | -| | [FASTAFS-FLAG](#fastafs-flag) | 2 bytes | Certain binary flags | +| | [FASTAFS-FLAGS](#fastafs-flags) | 2 bytes | Certain binary flags | | | [FILE-POSITION-OF-INDEX](#file-position-of-the-index) | [4-byte integer](#four-byte-integer) | Location in the file (offset in bytes from beginning) where the INDEX is located | | DATA | --- | --- | --- | | -> per sequence | @@ -40,7 +40,7 @@ If this metadata would be written in the header located before the sequence data | INDEX | --- | --- | | | | NUMBER-SEQUENCES | uint32_t as [4-byte integer](#four-byte-integer) | Number of sequences included | | -> per sequence | -| | [SEQUENCE-FLAG](#sequence-flag) | 2 bytes | storing metadata and type of data | +| | [SEQUENCE-FLAGS](#sequence-flags) | 2 bytes | storing metadata and type of data | | | NAME-LENGTH | 1 byte as unsigned char | length in bytes; name cannot exceed 255 bytes | | | NAME-FASTA | NAME-LENGTH x char | FASTA header; may not include new-lines or '>' | | | START-POSITION-IN-BODY of N-COMPR-NUC | uint32_t as [4-byte integer](#four-byte-integer) | Location in the file (offset in bytes from beginning) where the DATA block for this sequence starts | @@ -50,7 +50,7 @@ If this metadata would be written in the header located before the sequence data | | METADATA-TYPE-FLAG | 2 bytes | | | ENTRY | type specific, examples below: | | | => ORIGINAL PADDING | uint32_t as [4-byte integer](#four-byte-integer) | The number of nucleotides per line in the original FASTA file | - +| CRC32 | Checksum on entire file | 4 bytes | To ensure whole file integrity | ### GENERIC-HEADER ### @@ -80,7 +80,7 @@ The bit representation of these bytes are: +--------+--------+--------+--------+ ``` -#### FASTAFS-FLAG #### +#### FASTAFS-FLAGS #### ``` bit 0 file-complete @@ -115,13 +115,23 @@ The index is located at the end of the data. This file offset in bytes from the Repeated for every sequence, in order matching SEQUENCE-HEADER -#### SEQUENCE-FLAG #### +#### SEQUENCE-FLAGS #### The sequence flag allows to describe the following metadata for each sequence: ``` -bit 0 is-rna [1 = yes, 0 = DNA] -bit 1 reserved [reserved, library type 2 -> protein] +bit 0 combined sequence type +bit 1 combined sequence type +``` + +| bit-0 | bit-1 | Type | Alphabet | +| ---- | ---- | - | - | +| `0` | `0` | DNA | `ACTG` + `N` | +| `1` | `0` | RNA | `ACUG` + `N` | +| `0` | `1` | IUPEC Nucleotide | `ACGTURYKMSWBDHVN` + `-` | +| `1` | `1` | reserved for protein | to be determined | + +``` bit 2 reserved [reserved, library type 2 -> protein] bit 3 is-complete [1: checksum is present, 0: some regions are reserved but not yet 'downloaded'] bit 4 is-circular diff --git a/guix.scm b/guix.scm new file mode 100644 index 00000000..2e9c87c2 --- /dev/null +++ b/guix.scm @@ -0,0 +1,40 @@ +;; guix package --install-from-file=/home/youri/src/fastafs/guix.scm +;; https://guix.gnu.org/blog/2018/a-packaging-tutorial-for-guix/ + +(use-modules (guix packages) + (guix download) + (guix git-download) + (guix build-system gnu) + (guix build-system cmake) + (guix licenses) + (gnu packages boost) + (gnu packages compression) + (gnu packages tls) + (gnu packages linux)) + +(package + (name "fastafs") + (version "1.7.0") + (source (origin + (method url-fetch) + ; (uri (string-append "https://github.com/yhoogstrate/fastafs/archive/a39eddbf810d7a828d33d6dbe8c913bbffd58948.tar.gz")) + (uri (string-append "file:///home/youri/.local/src/fastafs.tar.gz")) + (sha256 + (base32 + "1njzvaxy1nq4202ispphyxddihq1x1cmfzbl8zmkqiwa028k540c")))) + (build-system cmake-build-system) + (arguments + `(#:build-type "debug" + #:tests? #f) ; skip tests that fail because test data is not in build path + ) + (inputs + `(("boost" ,boost) + ("zlib" ,zlib) + ("openssl" ,openssl) + ("fuse" ,fuse) + )) + (synopsis "fastafs") + (description + "fastafs: toolkit for file system virtualisation of random access compressed FASTA, FAI, DICT & TWOBIT files") + (home-page "https://github.com/yhoogstrate/fastafs") + (license gpl2+)) diff --git a/include/config.hpp.in b/include/config.hpp.in index 25046075..048d2d77 100644 --- a/include/config.hpp.in +++ b/include/config.hpp.in @@ -64,4 +64,8 @@ static const std::string DICT_HEADER = "@HD\tVN:1.0\tSO:unsorted\n"; static const std::string FASTAFS_FILE_XATTR_NAME = "fastafs-file"; static const std::string FASTAFS_PID_XATTR_NAME = "fastafs-pid"; + +static const size_t MAX_SIZE_SEQ_NAME = 255; + + #endif diff --git a/include/fasta_to_fourbit_fastafs.hpp b/include/fasta_to_fourbit_fastafs.hpp new file mode 100644 index 00000000..b26149c0 --- /dev/null +++ b/include/fasta_to_fourbit_fastafs.hpp @@ -0,0 +1,62 @@ + +#include + +#include + +#include "config.hpp" +#include "utils.hpp" + +#include "fastafs.hpp" +#include "fourbit_byte.hpp" + + + +class fasta_seq_header_fourbit_conversion_data +{ +public: + void add_ACTG(unsigned char, std::ofstream &);//Adds a T or a U + void add_N(); + void finish_sequence(std::ofstream &); + + off_t file_offset_in_fasta; // file positions where sequence data blocks start + std::string name; + + uint32_t N;// number of N (unknown) nucleotides (n - N = total 2bit compressed nucleotides) + uint32_t n_actg;// number of non-N nucleotides (any [ACTGU]) + + bool previous_was_N; + + + fasta_seq_header_fourbit_conversion_data(off_t arg_fof, + std::string &arg_name): + file_offset_in_fasta(arg_fof), + name(arg_name), + N(0), + n_actg(0), + previous_was_N(false), + in_m_block(false) + { + MD5_Init(&this->ctx); + } + + + // all below are undefined at initialization + uint32_t padding; + + // the followin should be member of a conversion struct, because they're not related to the original 2bit format: + MD5_CTX ctx; + unsigned char md5_digest[MD5_DIGEST_LENGTH]; + + std::vector n_block_starts; + std::vector n_block_ends; + + std::vector m_block_starts; + std::vector m_block_ends; + bool in_m_block; + + fourbit_byte fourbit_data; +}; + + +size_t fasta_to_fourbit_fastafs(const std::string &, const std::string &); + diff --git a/include/fasta_to_fastafs.hpp b/include/fasta_to_twobit_fastafs.hpp similarity index 86% rename from include/fasta_to_fastafs.hpp rename to include/fasta_to_twobit_fastafs.hpp index c93a8492..9199548a 100644 --- a/include/fasta_to_fastafs.hpp +++ b/include/fasta_to_twobit_fastafs.hpp @@ -11,7 +11,7 @@ -class fasta_seq_header_conversion_data +class fasta_seq_header_twobit_conversion_data { public: void add_ACTG(unsigned char, std::ofstream &);//Adds a T or a U @@ -27,7 +27,7 @@ class fasta_seq_header_conversion_data bool previous_was_N; - fasta_seq_header_conversion_data(off_t fof, std::string name): + fasta_seq_header_twobit_conversion_data(off_t fof, const std::string &name): file_offset_in_fasta(fof), name(name), N(0), @@ -57,5 +57,5 @@ class fasta_seq_header_conversion_data }; -size_t fasta_to_fastafs(const std::string, const std::string); +size_t fasta_to_twobit_fastafs(const std::string &, const std::string &); diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 7335a13a..18922096 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -1,4 +1,10 @@ + +#ifndef FASTAFS_HPP +#define FASTAFS_HPP + + + #include #include @@ -6,8 +12,8 @@ #include "utils.hpp" -#ifndef FASTAFS_HPP -#define FASTAFS_HPP +#include "sequence_region.hpp" +#include "flags.hpp" struct ffs2f_init_seq { @@ -55,7 +61,7 @@ class fastafs_seq uint32_t n;// number nucleotides std::vector n_starts;// start positions (nucleotide positions; 0-based) std::vector n_ends;// end positions (nucleotide positions; 0-based) - uint16_t flag; + fastafs_sequence_flags flags; std::vector m_starts;// start positions (nucleotide positions; 0-based) std::vector m_ends;// end positions (nucleotide positions; 0-based) @@ -69,12 +75,15 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); - uint32_t view_fasta_chunk_cached(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + size_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, std::ifstream *); + uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, std::ifstream *); + uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. - uint32_t n_twobits(); + uint32_t n_bits(); static uint32_t n_padding(uint32_t, uint32_t, uint32_t); bool get_n_offset(uint32_t, uint32_t *); @@ -101,20 +110,26 @@ class fastafs std::string name; std::string filename; std::vector data; - uint16_t flag; + uint32_t crc32f;// crc32 as found in fastafs file + + fastafs_flags flags; - uint32_t n(); + uint32_t n();// number nucleotdies std::string basename(); void load(std::string); void view_fasta(ffs2f_init*); - uint32_t view_fasta_chunk_cached(ffs2f_init*, char*, size_t, off_t);//@todo remove _cached suffix + size_t view_sequence_region_size(ffs2f_init*, const char *); // read stuff like "chr1:123-456" into the buffer + uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer + uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); uint32_t view_ucsc2bit_chunk(char *, size_t, off_t); size_t view_dict_chunk(char *, size_t, off_t); + uint32_t get_crc32(void);// returns a 'new' crc32, estimated on file contents + size_t fastafs_filesize(void); size_t fasta_filesize(uint32_t); size_t ucsc2bit_filesize(void); size_t dict_filesize(void); @@ -122,7 +137,8 @@ class fastafs std::string get_faidx(uint32_t);//@todo get rid of this, make it full chunked int info(bool); - int check_integrity(void); + bool check_file_integrity(bool); + bool check_sequence_integrity(bool); }; diff --git a/include/flags.hpp b/include/flags.hpp new file mode 100644 index 00000000..cdf837d4 --- /dev/null +++ b/include/flags.hpp @@ -0,0 +1,122 @@ + + +#ifndef FLAGS_HPP +#define FLAGS_HPP + +#include + + +const unsigned char FASTAFS_BITFLAG_COMPLETE = 0; + +const unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1 = 0; +const unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2 = 1; +// const unsigned char FASTAFS_SEQUENCE_BITFLAG_???? = 2 ; // is reserved +const unsigned char FASTAFS_SEQUENCE_BITFLAG_COMPLETE = 3; +const unsigned char FASTAFS_SEQUENCE_BITFLAG_CIRCULAR = 4; + + + + +constexpr std::array bitmasks = { + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 + + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 +}; + + +//#include "utils.hpp" + + +class twobit_flag +{ +protected: + twobit_flag(); + + std::array bits; // 00000000 00000000 + + // set by flag + void set_flag(unsigned char, bool);// counting flag from bit 0(!) + bool get_flag(unsigned char); + +public: + void set(char *); + std::array &get_bits(void); // get bit 0 or bit 1 +}; + + +class fastafs_flags : public twobit_flag +{ +public: + bool is_complete(); + bool is_incomplete() + { + return !this->is_complete(); + }; + + void set_complete(); + void set_incomplete(); +}; + + + +class fastafs_sequence_flags : public twobit_flag +{ + +public: + bool is_dna(); // alphabet: 'ACTG' + 'N' + bool is_rna(); // alphabet: 'ACUG' + 'N' + bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' + + bool is_complete(); + bool is_incomplete() + { + return !this->is_complete(); + }; + + bool is_circular(); + bool is_linear() + { + return !this->is_circular(); + }; + + bool is_twobit() + { + return (this->is_dna() | this->is_rna()); + }; + bool is_fourbit() + { + return this->is_iupec_nucleotide(); + }; + + + // set by entity + void set_dna(); + void set_rna(); + void set_iupec_nucleotide(); + + void set_complete(); + void set_incomplete(); + + void set_linear(); + void set_circular(); + +}; + + + + +#endif diff --git a/include/fourbit_byte.hpp b/include/fourbit_byte.hpp new file mode 100644 index 00000000..89e46957 --- /dev/null +++ b/include/fourbit_byte.hpp @@ -0,0 +1,28 @@ + +#ifndef fourbit_BYTE_HPP +#define fourbit_BYTE_HPP + +#include +#include "config.hpp" + +class fourbit_byte +{ +public: + static const char fourbit_alhpabet[17]; + static const char encode_hash[256][3]; + static const char n_fill_unmasked = '-'; + static const char n_fill_masked = '-'; + + static const char bits_per_nucleotide = 4; + static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; + + unsigned char data; + void set(unsigned char, unsigned char); + void set(char*);// string met 4 bytes set + const char *get(void); + char *get(unsigned char); + + static unsigned char iterator_to_offset(unsigned int); +}; + +#endif diff --git a/include/sequence_region.hpp b/include/sequence_region.hpp new file mode 100644 index 00000000..cb86ed87 --- /dev/null +++ b/include/sequence_region.hpp @@ -0,0 +1,43 @@ + + + +#ifndef SEQUENCE_REGION_HPP +#define SEQUENCE_REGION_HPP + + +#include +#include +#include +#include + +#include + + +#include "config.hpp" + +#include "utils.hpp" + + + +class sequence_region +{ +public: + sequence_region(char *); + sequence_region(const char *); + + std::string seq_name; + + bool has_defined_end; + + off_t start; + off_t end; + +private: + void parse(const char *); +}; + + + +#endif + + diff --git a/include/twobit_byte.hpp b/include/twobit_byte.hpp index 805ddb17..52b4a41b 100644 --- a/include/twobit_byte.hpp +++ b/include/twobit_byte.hpp @@ -8,7 +8,12 @@ class twobit_byte { public: - static const char twobit_hash[256][5]; + static const char encode_hash[256][5]; + static const char n_fill_unmasked = 'N'; + static const char n_fill_masked = 'n'; + + static const char bits_per_nucleotide = 2; + static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; unsigned char data; void set(unsigned char, unsigned char); diff --git a/include/ucsc2bit_to_fastafs.hpp b/include/ucsc2bit_to_fastafs.hpp index 0c078b15..051abfb1 100644 --- a/include/ucsc2bit_to_fastafs.hpp +++ b/include/ucsc2bit_to_fastafs.hpp @@ -27,6 +27,9 @@ struct ucsc2bit_seq_header { uint32_t m_blocks; std::vector m_block_starts; std::vector m_block_sizes; + + ucsc2bit_seq_header(): + name_size(0), name(nullptr), n_blocks(0) { } }; struct ucsc2bit_seq_header_conversion_data { diff --git a/include/utils.hpp b/include/utils.hpp index da6efd2f..8cc89094 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -6,6 +6,7 @@ uint32_t fourbytes_to_uint_ucsc2bit(char *, unsigned char); uint16_t twobytes_to_uint(char *); void uint_to_twobytes(char *chars, uint16_t n); +size_t remove_chars(char *s, int c, size_t l);// to remove - characters from string void uint_to_fourbytes(char *, uint32_t); void uint_to_fourbytes_ucsc2bit(char *, uint32_t); @@ -16,7 +17,8 @@ void md5_digest_to_hash(unsigned char *, char *); std::string std_string_nullbyte_safe(char *, size_t, size_t); std::string std_string_nullbyte_safe(char *, size_t); -bool is_fasta_file(char *filename); +bool is_fasta_file(char *); +bool is_ucsc2bit_file(char *); std::string basename_cpp(std::string); std::string realpath_cpp(std::string); diff --git a/meson.build b/meson.build new file mode 100644 index 00000000..649fb18b --- /dev/null +++ b/meson.build @@ -0,0 +1,42 @@ +project('fastafs', 'cpp', + version : run_command('bash', '-c' , 'grep PROJECT_VERSION CMakeLists.txt | grep -Po \'".+"\' | grep -Po \'[^"]+\'').stdout().strip(), default_options : ['warning_level=3', 'cpp_std=c++14']) + +project_version = run_command('bash', '-c' , 'grep PROJECT_VERSION CMakeLists.txt | grep -Po \'".+"\' | grep -Po \'[^"]+\'').stdout().strip() + + +add_global_arguments('-O3', language : 'cpp') +add_global_arguments('-D_FILE_OFFSET_BITS=64', language : 'cpp') + + +# make config: +# prefix = get_option('prefix') +# https://mesonbuild.com/Configuration.html#a-full-example +conf_data = configuration_data() +conf_data.set('PROJECT_VERSION', project_version) +conf_data.set('CMAKE_PROJECT_NAME', 'fastafs') +conf_data.set('DEBUG', 'false') +configure_file(input : 'include/config.hpp.in', + output : 'config.hpp', + configuration : conf_data) +#configuration_inc = include_directories('include') + + + + + + + +src = ['./src/fasta_to_fourbit_fastafs.cpp', './src/fasta_to_twobit_fastafs.cpp', './src/flags.cpp', './src/fourbit_byte.cpp', './src/twobit_byte.cpp', './src/ucsc2bit.cpp', './src/ucsc2bit_to_fastafs.cpp', './src/fastafs.cpp', './src/fuse.cpp', './src/utils.cpp', './src/database.cpp', './src/lsfastafs.cpp', './src/main.cpp'] + + +incdir = include_directories('include') + +fuse = dependency('fuse') +crypto = dependency('libcrypto') +openssl = dependency('openssl') +zlib = dependency('zlib') + +executable('fastafs', src, + include_directories : incdir, + dependencies: [crypto, openssl, fuse, zlib]) + diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp new file mode 100644 index 00000000..1f9ff5c5 --- /dev/null +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -0,0 +1,627 @@ +#include +#include + +#include + +#include "config.hpp" + +#include "fasta_to_fourbit_fastafs.hpp" +#include "flags.hpp" +#include "utils.hpp" + + + +const static char na[2] = "A"; +const static char nc[2] = "C"; +const static char ng[2] = "G"; +const static char nt[2] = "T"; + +const static char nu[2] = "U"; +const static char nr[2] = "R"; +const static char ny[2] = "Y"; +const static char nk[2] = "K"; +const static char nm[2] = "M"; +const static char ns[2] = "S"; +const static char nw[2] = "W"; +const static char nb[2] = "B"; +const static char nd[2] = "D"; +const static char nh[2] = "H"; +const static char nv[2] = "V"; + +const static char nn[2] = "N"; + +//const char fourbit_byte::fourbit_alhpabet[17] = "ACGTURYKMSWBDHVN"; + + + +void fasta_seq_header_fourbit_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) +{ + this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = + + // if fourth nucleotide, 2bit is complete; write to disk + if(this->n_actg % 2 == 1) { + fh_fastafs << this->fourbit_data.data; + } + + if(this->previous_was_N) { + this->n_block_ends.push_back(this->n_actg + this->N - 1); + } + + this->previous_was_N = false; + this->n_actg++; +} + +void fasta_seq_header_fourbit_conversion_data::add_N() +{ + if(!this->previous_was_N) { + this->n_block_starts.push_back(this->n_actg + this->N); + } + + this->previous_was_N = true; + this->N++; +} + + + +void fasta_seq_header_fourbit_conversion_data::finish_sequence(std::ofstream &fh_fastafs) +{ + uint32_t j; + + // flush last nucleotide + if(this->n_actg % 2 != 0) { + this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), 0); + + fh_fastafs << this->fourbit_data.data; + } + + if(this->previous_was_N) { + this->n_block_ends.push_back(this->n_actg + this->N - 1); + } + + // do M block + if(this->in_m_block) { + this->m_block_ends.push_back(this->n_actg + this->N - 1); + //printf("closing m-block: %u\n",this->n_actg + this->N - 1); + } + +#if DEBUG + if(this->m_block_starts.size() != this->m_block_ends.size()) { + throw std::runtime_error("M blocks not correctly parsed\n"); + } +#endif //DEBUG + + char buffer[4 + 1]; + + // (over)write number nucleotides + std::streamoff index_file_position = fh_fastafs.tellp(); + fh_fastafs.seekp(this->file_offset_in_fasta, std::ios::beg); + uint_to_fourbytes(buffer, this->n_actg); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + fh_fastafs.seekp(index_file_position, std::ios::beg); + + // N blocks + uint_to_fourbytes(buffer, (uint32_t) this->n_block_starts.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + for(j = 0; j < this->n_block_starts.size(); j++) { + uint_to_fourbytes(buffer, this->n_block_starts[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } + for(j = 0; j < this->n_block_ends.size(); j++) { + uint_to_fourbytes(buffer, this->n_block_ends[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } + + // write checksum + MD5_Final(this->md5_digest, &this->ctx); + fh_fastafs.write(reinterpret_cast(&this->md5_digest), (size_t) 16); + + // M blocks + uint_to_fourbytes(buffer, (uint32_t) this->m_block_starts.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + for(j = 0; j < this->m_block_starts.size(); j++) { + uint_to_fourbytes(buffer, this->m_block_starts[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } + for(j = 0; j < this->m_block_ends.size(); j++) { + uint_to_fourbytes(buffer, this->m_block_ends[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } +} + + + +size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string &fastafs_file) +{ + std::vector index; + fasta_seq_header_fourbit_conversion_data* s; + + fastafs_flags ffsf; + ffsf.set_incomplete(); + + // @todo use ifstream and ofstream argument types + std::string line; + std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); + std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + s = nullptr; + if(fh_fasta.is_open() and fh_fastafs.is_open()) { + fh_fastafs << FASTAFS_MAGIC; + fh_fastafs << FASTAFS_VERSION; + + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET + + // iterate until first sequence is found, ensuring we won't write to uninitialized sequences + while(s == nullptr and getline(fh_fasta, line)) { + if(line[0] == '>') { + + // init new sequence + line.erase(0, 1);// erases first part, quicker would be pointer from first char + s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); + fh_fastafs << "\x00\x00\x00\x00"s;// placeholder for sequence length + index.push_back(s); + } + } + + if(s != nullptr) { + while(getline(fh_fasta, line)) { + if(line[0] == '>') { + s->finish_sequence(fh_fastafs); + + // init sequence + line.erase(0, 1);// erases first part, quicker would be pointer from first char + s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); + fh_fastafs << "\x00\x00\x00\x00"s;// number of 2bit encoded nucleotides, not yet known + index.push_back(s); + } else { + std::cout << "{"; + for(std::string::iterator it = line.begin(); it != line.end(); ++it) { + switch(*it) { + + case 'A': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(0, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'a': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(0, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'C': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(1, fh_fastafs); + MD5_Update(&s->ctx, nc, 1); + break; + case 'c': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(1, fh_fastafs); + MD5_Update(&s->ctx, nc, 1); + break; + case 'G': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(2, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'g': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(2, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'T': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(3, fh_fastafs); + MD5_Update(&s->ctx, nt, 1); + break; + case 't': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(3, fh_fastafs); + MD5_Update(&s->ctx, nt, 1); + break; + case 'U': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(4, fh_fastafs); + MD5_Update(&s->ctx, nu, 1); + break; + case 'u': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(4, fh_fastafs); + MD5_Update(&s->ctx, nu, 1); + break; + +// ==================================================== + + case 'R': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(5, fh_fastafs); + MD5_Update(&s->ctx, nr, 1); + break; + case 'r': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(5, fh_fastafs); + MD5_Update(&s->ctx, nr, 1); + break; + case 'Y': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(6, fh_fastafs); + MD5_Update(&s->ctx, ny, 1); + break; + case 'y': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(6, fh_fastafs); + MD5_Update(&s->ctx, ny, 1); + break; + case 'K': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(7, fh_fastafs); + MD5_Update(&s->ctx, nk, 1); + break; + case 'k': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(7, fh_fastafs); + MD5_Update(&s->ctx, nk, 1); + break; + case 'M': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(8, fh_fastafs); + MD5_Update(&s->ctx, nm, 1); + break; + case 'm': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(8, fh_fastafs); + MD5_Update(&s->ctx, nm, 1); + break; + case 'S': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(9, fh_fastafs); + MD5_Update(&s->ctx, ns, 1); + break; + case 's': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(9, fh_fastafs); + MD5_Update(&s->ctx, ns, 1); + break; + case 'W': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(10, fh_fastafs); + MD5_Update(&s->ctx, nw, 1); + break; + case 'w': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(10, fh_fastafs); + MD5_Update(&s->ctx, nw, 1); + break; + case 'B': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(11, fh_fastafs); + MD5_Update(&s->ctx, nb, 1); + break; + case 'b': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(11, fh_fastafs); + MD5_Update(&s->ctx, nb, 1); + break; + case 'D': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(12, fh_fastafs); + MD5_Update(&s->ctx, nd, 1); + break; + case 'd': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(12, fh_fastafs); + MD5_Update(&s->ctx, nd, 1); + break; + case 'H': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(13, fh_fastafs); + MD5_Update(&s->ctx, nh, 1); + break; + case 'h': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(13, fh_fastafs); + MD5_Update(&s->ctx, nh, 1); + break; + case 'V': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(14, fh_fastafs); + MD5_Update(&s->ctx, nv, 1); + break; + case 'v': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(14, fh_fastafs); + MD5_Update(&s->ctx, nv, 1); + break; + case 'N': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(15, fh_fastafs); + MD5_Update(&s->ctx, nn, 1); + break; + case 'n': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(15, fh_fastafs); + MD5_Update(&s->ctx, nn, 1); + break; + case '-': + /*if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + }*/ + + s->add_N(); + //MD5_Update(&s->ctx, nn, 1); + break; + + default: + throw std::runtime_error("[fasta_to_fourbit_fastafs] invalid chars in FASTA file"); + break; + } + } + } + } + } + fh_fasta.close(); + } + if(s != nullptr) { + s->finish_sequence(fh_fastafs);// finish last sequence + } + + // write index/footer + unsigned int index_file_position = (uint32_t) fh_fastafs.tellp(); + //std::cout << " index file pos: " + std::to_string(index_file_position) + " \n"; + char buffer[4 + 1]; + uint_to_fourbytes(buffer, (uint32_t) index.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + for(size_t i = 0; i < index.size(); i++) { + s = index[i]; + + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + fsf.set_iupec_nucleotide(); + fsf.set_complete(); + fh_fastafs << fsf.get_bits()[0]; + fh_fastafs << fsf.get_bits()[1]; + + // name + unsigned char name_size = (unsigned char) s->name.size(); + fh_fastafs.write((char *) &name_size, 1); // name size + fh_fastafs.write(s->name.c_str(), (size_t) s->name.size());// name + + // location of sequence data in file + uint_to_fourbytes(buffer, (uint32_t) s->file_offset_in_fasta); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + delete s; + } + fh_fastafs << "\x00"s;// no metadata tags (YET) + + // update header: set to updated + fh_fastafs.seekp(8, std::ios::beg); + ffsf.set_complete(); + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + + uint_to_fourbytes(buffer, index_file_position);//position of header + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + fh_fasta.close(); + + + fh_fastafs.seekp(0, std::ios::end); + + //printf("file size now: %i\n", (uint32_t) fh_fastafs.tellp()); + + + + std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + uint32_t nnn = 0; + uint32_t iii; + + uLong crc = crc32(0L, Z_NULL, 0); + + bool terminate = false; + bool togo = true; + while(togo) { + if(!fh_fastafs_crc.read(buffer, 4)) { + terminate = true; + } + //printf("alive [%i]\n", fh_fastafs_crc.gcount()); + iii = (uint32_t) fh_fastafs_crc.gcount(); + crc = crc32(crc, (const Bytef*)& buffer, iii); + nnn += iii; + + if(terminate) { + togo = false; + } + } + + // -- + //printf("nnn = %i\n", nnn); + + + + //write crc as 4 bytes + char byte_enc[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(byte_enc, (uint32_t) crc); + //printf("[%i][%i][%i][%i] ~ %02hhx%02hhx%02hhx%02hhx \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3], + // byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + + + // calc written size + fh_fastafs.seekp(0, std::ios::end); + size_t written = fh_fastafs.tellp(); + + + //printf("file size now: %i\n", fh_fastafs.tellp()); + + fh_fastafs.close(); + + return written; +} + diff --git a/src/fasta_to_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp similarity index 78% rename from src/fasta_to_fastafs.cpp rename to src/fasta_to_twobit_fastafs.cpp index fc1a1b81..7e673171 100644 --- a/src/fasta_to_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -1,14 +1,25 @@ #include #include +#include "zlib.h" #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" +#include "flags.hpp" #include "utils.hpp" -void fasta_seq_header_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) +const static char nt[2] = "T"; +const static char nc[2] = "C"; +const static char na[2] = "A"; +const static char ng[2] = "G"; +const static char nn[2] = "N"; + + + +void fasta_seq_header_twobit_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) { this->twobit_data.set(twobit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = @@ -25,7 +36,7 @@ void fasta_seq_header_conversion_data::add_ACTG(unsigned char nucleotide, std::o this->n_actg++; } -void fasta_seq_header_conversion_data::add_N() +void fasta_seq_header_twobit_conversion_data::add_N() { if(!this->previous_was_N) { this->n_block_starts.push_back(this->n_actg + this->N); @@ -37,7 +48,7 @@ void fasta_seq_header_conversion_data::add_N() -void fasta_seq_header_conversion_data::finish_sequence(std::ofstream &fh_fastafs) +void fasta_seq_header_twobit_conversion_data::finish_sequence(std::ofstream &fh_fastafs) { uint32_t j; @@ -106,33 +117,34 @@ void fasta_seq_header_conversion_data::finish_sequence(std::ofstream &fh_fastafs -const static char nt[2] = "T"; -const static char nc[2] = "C"; -const static char na[2] = "A"; -const static char ng[2] = "G"; -const static char nn[2] = "N"; - -size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_file) +size_t fasta_to_twobit_fastafs(const std::string &fasta_file, const std::string &fastafs_file) { - std::vector index; - fasta_seq_header_conversion_data* s; + std::vector index; + fasta_seq_header_twobit_conversion_data* s; + + fastafs_flags ffsf; + ffsf.set_incomplete(); // @todo use ifstream and ofstream argument types std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + s = nullptr; if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; - fh_fastafs << "\x00\x00"s;// the flag for now, set to INCOMPLETE as writing is in progress + + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // iterate until first sequence is found, ensuring we won't write to uninitialized sequences - s = nullptr; while(s == nullptr and getline(fh_fasta, line)) { if(line[0] == '>') { line.erase(0, 1);// erases first part, quicker would be pointer from first char - s = new fasta_seq_header_conversion_data(fh_fastafs.tellp(), line); + s = new fasta_seq_header_twobit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// placeholder for sequence length index.push_back(s); } @@ -144,13 +156,14 @@ size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_ s->finish_sequence(fh_fastafs); line.erase(0, 1);// erases first part, quicker would be pointer from first char - s = new fasta_seq_header_conversion_data(fh_fastafs.tellp(), line); + s = new fasta_seq_header_twobit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// number of 2bit encoded nucleotides, not yet known index.push_back(s); } else { for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { + // keeping daling with upper-case and lower-case in separate cases is quicker than one if/else before the switch, simply beacuse switches are faster than if-statements. case 'U': case 'T': if(s->in_m_block) { @@ -254,8 +267,7 @@ size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_ MD5_Update(&s->ctx, nn, 1); break; default: - std::cerr << "invalid chars in FASTA file" << std::endl; - exit(1); + throw std::runtime_error("[fasta_to_twobit_fastafs] invalid chars in FASTA file"); break; } } @@ -277,8 +289,13 @@ size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_ for(size_t i = 0; i < index.size(); i++) { s = index[i]; - // flag - fh_fastafs << "\x00\x08"s; + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + fsf.set_dna(); + fsf.set_complete(); + fh_fastafs << fsf.get_bits()[0]; + fh_fastafs << fsf.get_bits()[1]; // name unsigned char name_size = (unsigned char) s->name.size(); @@ -294,17 +311,68 @@ size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_ // update header: set to updated fh_fastafs.seekp(8, std::ios::beg); - fh_fastafs << "\x00\x01"s; // updated flag + ffsf.set_complete(); + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - // calc written size + fh_fasta.close(); + + // now calculate crc32 checksum, as all bits have been set. fh_fastafs.seekp(0, std::ios::end); - size_t written = fh_fastafs.tellp(); - fh_fasta.close(); + fastafs f(""); + f.load(fastafs_file); + uint32_t crc32c = f.get_crc32(); + + char byte_enc[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(byte_enc, (uint32_t) crc32c); + //printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + + /* + std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + uint32_t nnn = 0; + uint32_t iii; + + uLong crc = crc32(0L, Z_NULL, 0); + + bool terminate = false; + bool togo = true; + while(togo) + { + if(!fh_fastafs_crc.read(buffer, 4)) { + terminate = true; + } + //printf("alive [%i]\n", fh_fastafs_crc.gcount()); + iii = fh_fastafs_crc.gcount(); + crc = crc32(crc, (const Bytef*)& buffer, iii); + nnn += iii; + + if(terminate) { + togo = false; + } + }; + // -- + printf("nnn = %i\n",nnn); + + //write crc + uint_to_fourbytes(byte_enc, (uint32_t) crc); + printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + */ + + // finalize file + size_t written = fh_fastafs.tellp(); fh_fastafs.close(); + + + return written; } diff --git a/src/fastafs.cpp b/src/fastafs.cpp index d786e02b..3b7de434 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +//#include #include #include @@ -25,11 +25,13 @@ #include #include #include +#include // crc32 #include "config.hpp" #include "twobit_byte.hpp" +#include "fourbit_byte.hpp" #include "fastafs.hpp" #include "utils.hpp" @@ -53,25 +55,28 @@ uint32_t fastafs_seq::fasta_filesize(uint32_t padding) { #if DEBUG if(padding == 0) { - throw std::invalid_argument("Padding is set to 0, should have been set to this->n elsewhere.\n"); + throw std::invalid_argument("[fastafs_seq::fasta_filesize] Padding is set to 0, should have been set to this->n elsewhere.\n"); } #endif // > chr \n ACTG NNN /number of newlines corresponding to ACTG NNN lines + return 1 + (uint32_t) this->name.size() + 1 + this->n + (this->n + (padding - 1)) / padding; } + + void fastafs_seq::view_fasta(ffs2f_init_seq* cache, std::ifstream *fh) { char buffer[READ_BUFFER_SIZE];// = new char [READ_BUFFER_SIZE]; uint32_t offset = 0; //@todo figure out if a do {} while() loop isn't more in place here? - uint32_t written = this->view_fasta_chunk_cached(cache, buffer, READ_BUFFER_SIZE, offset, fh); + uint32_t written = this->view_fasta_chunk(cache, buffer, READ_BUFFER_SIZE, offset, fh); while(written > 0) { std::cout << std::string(buffer, written); offset += written; - written = this->view_fasta_chunk_cached(cache, buffer, READ_BUFFER_SIZE, offset, fh); + written = this->view_fasta_chunk(cache, buffer, READ_BUFFER_SIZE, offset, fh); } } @@ -106,6 +111,7 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all data->n_starts[i] = fasta_header_size + this->n_starts[i] + (this->n_starts[i] / padding); data->n_ends[i] = fasta_header_size + this->n_ends[i] + (this->n_ends[i] / padding); } + block_size = data->n_starts.size(); data->n_starts[block_size - 1] = max_val; data->n_ends[block_size - 1] = max_val; @@ -125,8 +131,29 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all return data; } + + +// @todo templating like stuff +uint32_t fastafs_seq::view_fasta_chunk( + ffs2f_init_seq* cache, + char *buffer, + + size_t buffer_size, + off_t start_pos_in_fasta, + + std::ifstream *fh) +{ + if(this->flags.is_twobit()) { + return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); + } else { + return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); + } +} + + + /* - * fastafs_seq::view_fasta_chunk_cached - + * fastafs_seq::view_fasta_chunk - * * @padding = number of spaces? * @char buffer = @@ -139,7 +166,7 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all * * @todo see if this can be a std::ifstream or some kind of stream type of object? */ -uint32_t fastafs_seq::view_fasta_chunk_cached( +template uint32_t fastafs_seq::view_fasta_chunk_generalized( ffs2f_init_seq* cache, char *buffer, @@ -154,8 +181,10 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( } #endif //DEBUG + T t = T();// nice way of having this templated object on stack :) uint32_t written = 0; + if(written >= buffer_size) { // requesting a buffer of size=0, should throw an exception? return written; } @@ -203,21 +232,20 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( // when we are in an OPEN n block, we need to go to the first non-N base after, and place the file pointer there uint32_t n_passed = 0; this->get_n_offset(nucleotide_pos, &n_passed); - fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / 4), fh->beg); + fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / T::nucleotides_per_byte), fh->beg); /* 0 0 0 0 1 1 1 1 << desired offset from starting point A C T G A C T G * handigste is om file pointer naar de byte ervoor te zetten - vervolgens wanneer twobit_offset gelijk is aan nul, lees je de volgende byte + vervolgens wanneer bit_offset gelijk is aan nul, lees je de volgende byte * nooit out of bound */ - twobit_byte t = twobit_byte(); - const char *chunk = twobit_byte::twobit_hash[0]; - unsigned char twobit_offset = (nucleotide_pos - n_passed) % 4; - if(twobit_offset != 0) { + const char *chunk = T::encode_hash[0];// init + unsigned char bit_offset = (nucleotide_pos - n_passed) % T::nucleotides_per_byte;// twobit -> 4, fourbit: -> 2 + if(bit_offset != 0) { fh->read((char*)(&t.data), 1); chunk = t.get(); } @@ -237,23 +265,23 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( while(pos < pos_limit) {// while next sequence-containing-line is open if(pos >= cache->n_starts[n_block]) { if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = 'n'; + buffer[written++] = T::n_fill_masked; } else { - buffer[written++] = 'N'; + buffer[written++] = T::n_fill_unmasked; } } else { - if(twobit_offset % 4 == 0) { + if(bit_offset % T::nucleotides_per_byte == 0) { fh->read((char*)(&t.data), 1); chunk = t.get(); } if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = (unsigned char)(chunk[twobit_offset] + 32); + buffer[written++] = (unsigned char)(chunk[bit_offset] + 32); } else { - buffer[written++] = chunk[twobit_offset]; + buffer[written++] = chunk[bit_offset]; } - twobit_offset = (unsigned char)(twobit_offset + 1) % 4; + bit_offset = (unsigned char)(bit_offset + 1) % T::nucleotides_per_byte; } if(pos == cache->n_ends[n_block]) { n_block++; @@ -261,6 +289,7 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( if(pos == cache->m_ends[m_block]) { m_block++; } + pos++; if(written >= buffer_size) { @@ -274,19 +303,91 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( if(pos < pos_limit) { buffer[written++] = '\n'; pos++; + if(written >= buffer_size) { //fh->clear(); return written; } } + newlines_passed++; } + //fh->clear(); return written; } + +size_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_region* sr, std::ifstream *fh) +{ +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 01\n"); + } + + if(sr == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 02\n"); + } + +#endif + + + size_t total_requested_size; + if(sr->has_defined_end) { + total_requested_size = std::min((size_t) this->n, (size_t) sr->end + 1); + } else { + total_requested_size = this->n; + } + + total_requested_size -= sr->start; + return total_requested_size; +} + +uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_region* sr, char *buffer, size_t size, off_t offset, std::ifstream *fh) +{ +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 01\n"); + } + + if(sr == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 02\n"); + } + + if(size == 0) { // requestedsize must be larger than 0 + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 03\n"); + } + +#endif + + uint32_t written = 0; + + size_t total_requested_size; + if(sr->has_defined_end) { + total_requested_size = std::min((size_t) this->n, (size_t) sr->end + 1); + } else { + total_requested_size = this->n; + } + + total_requested_size -= sr->start; + total_requested_size -= offset; + total_requested_size = std::min(size, total_requested_size); + + written = (uint32_t) this->view_fasta_chunk( + cache, // ffs2f_init_seq* cache, + buffer, // char *buffer + (size_t) total_requested_size, // size_t buffer_size, + (off_t) 2 + this->name.size() + sr->start + offset, // offset is for chunked reading + fh + ); + + return written; +} + + + /* CRAM specification: @@ -336,15 +437,15 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { - this->view_fasta_chunk_cached(cache, chunk, - chunksize, - header_offset + (i * chunksize), - fh); + this->view_fasta_chunk(cache, chunk, + chunksize, + header_offset + (i * chunksize), + fh); SHA1_Update(&ctx, chunk, chunksize); } if(remaining_bytes > 0) { - this->view_fasta_chunk_cached(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); + this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); SHA1_Update(&ctx, chunk, remaining_bytes); //chunk[remaining_bytes] = '\0'; } @@ -361,6 +462,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) } + std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) { #if DEBUG @@ -386,18 +488,29 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) unsigned long n_iterations = (unsigned long) this->n / chunksize; signed int remaining_bytes = this->n % chunksize; + size_t written; // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { - this->view_fasta_chunk_cached(cache, chunk, - chunksize, - header_offset + (i * chunksize), - fh); - MD5_Update(&ctx, chunk, chunksize); + written = this->view_fasta_chunk(cache, chunk, + chunksize, + header_offset + (i * chunksize), + fh); + + if(this->flags.is_fourbit()) { + written = remove_chars(chunk, '-', written); + } + + MD5_Update(&ctx, chunk, written); } if(remaining_bytes > 0) { - this->view_fasta_chunk_cached(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); - MD5_Update(&ctx, chunk, remaining_bytes); + written = this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); + + if(this->flags.is_fourbit()) { + written = remove_chars(chunk, '-', written); + } + + MD5_Update(&ctx, chunk, written); chunk[remaining_bytes] = '\0'; } @@ -414,19 +527,32 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) -uint32_t fastafs_seq::n_twobits() +uint32_t fastafs_seq::n_bits() { - // if n actg bits is: - // 0 -> 0 - // 1,2,3 and 4 -> 1 - uint32_t n = this->n; + uint32_t n = this->n;// number of characters + + // minus number of masked characters for(uint32_t i = 0; i < this->n_starts.size(); i++) { n -= n_ends[i] - this->n_starts[i] + 1; } - return (n + 3) / 4; + + // divided by bits per bytes + if(this->flags.is_twobit()) { + // if n actg bits is: + // 0 -> 0 + // 1,2,3 and 4 -> 1 + return (n + (twobit_byte::nucleotides_per_byte - 1)) / twobit_byte::nucleotides_per_byte; + } else if(this->flags.is_fourbit()) { + return (n + (fourbit_byte::nucleotides_per_byte - 1)) / fourbit_byte::nucleotides_per_byte; + } else { + return 0; // unclear yet + } } + + + //@brief calculates the number of paddings found in a sequence of length N with uint32_t fastafs_seq::n_padding(uint32_t offset, uint32_t position_until, uint32_t padding) { @@ -475,7 +601,7 @@ bool fastafs_seq::get_n_offset(uint32_t pos, uint32_t *num_Ns) fastafs::fastafs(std::string arg_name) : - name(arg_name) + name(arg_name), crc32f(0) { } @@ -524,7 +650,23 @@ void fastafs::load(std::string afilename) } } - this->flag = twobytes_to_uint(&memblock[8]); + this->flags.set(&memblock[8]); + if(this->flags.is_incomplete()) { + throw std::invalid_argument("Incomplete FASTAFS file (probably terminated during conversion): " + filename); + } + + /* + unsigned char bits; + unsigned char bits_per_byte; + if(this->flags.is_twobit()) { + bits = 2; + bits_per_byte = 4; + } + else { + bits = 4; + bits_per_byte = 2; + }*/ + std::streampos file_cursor = (std::streampos) fourbytes_to_uint(&memblock[10], 0); // INDEX @@ -539,7 +681,7 @@ void fastafs::load(std::string afilename) // flag file.read(memblock, 2); - s->flag = twobytes_to_uint(memblock); + s->flags.set(memblock);// should be initialized during construction of this class // name length file.read(memblock, 1); @@ -563,11 +705,16 @@ void fastafs::load(std::string afilename) s->n = fourbytes_to_uint(memblock, 0); // skip nucleotides - file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); + if(s->flags.is_twobit()) { // there fit 4 twobits in a byte, thus divide by 4, + file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); + } else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, + file.seekg((uint32_t) s->data_position + 4 + ((s->n + 1) / 2), file.beg); + } // N-blocks (and update this->n instantly) file.read(memblock, 4); uint32_t N_blocks = fourbytes_to_uint(memblock, 0); + s->n_starts.resize(N_blocks); s->n_ends.resize(N_blocks); for(j = 0; j < s->n_starts.size(); j++) { @@ -580,10 +727,12 @@ void fastafs::load(std::string afilename) s->n += s->n_ends[j] - s->n_starts[j] + 1; } - // MD5-checksum - file.read(memblock, 16); - for(int j = 0; j < 16 ; j ++) { - s->md5_digest[j] = memblock[j]; + // MD5-checksum - only if sequence is complete + if(s->flags.is_complete()) { + file.read(memblock, 16); + for(int j = 0; j < 16 ; j ++) { + s->md5_digest[j] = memblock[j]; + } } // M-blocks @@ -600,9 +749,22 @@ void fastafs::load(std::string afilename) s->m_ends[j] = fourbytes_to_uint(memblock, 0); } } + file.seekg(file_cursor, file.beg); this->data[i] = s; } + + // metadata section - empty for now + file.read(memblock, 1); + + // crc32 checksum - may be missing because fastafs::load is also used before fastafs::get_crc32 is ran to obtain the checksum + file.read(memblock, 4); + if(file.gcount() == 4) { + this->crc32f = fourbytes_to_uint(memblock, 0); + } else { + //printf("crc32 checksum missing\n"); + } + file.close(); delete[] memblock; } @@ -623,6 +785,7 @@ void fastafs::view_fasta(ffs2f_init* cache) for(uint32_t i = 0; i < this->data.size(); i++) { this->data[i]->view_fasta(cache->sequences[i], &file); } + file.close(); } } @@ -641,8 +804,81 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) return ddata; } + + + +// estimates the whole file size of a file such as "/seq/chr1:56-" +size_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg) +{ +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs::view_sequence_region - error 01\n"); + } + + if(cache->padding_arg != 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 02\n"); + } + + if(cache->sequences.size() == 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 03\n"); + } +#endif + + std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if(file.is_open()) { + // parse "chr..:..-.." string + sequence_region sr = sequence_region(seq_region_arg); + + // 02 : check if 'chr' is equals this->data[i].name + for(size_t i = 0; i < this->data.size(); i++) { + if(sr.seq_name.compare(this->data[i]->name) == 0) { + return this->data[i]->view_sequence_region_size(cache->sequences[i], &sr, &file); + } + } + } + + return 0; +} + + + +uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region_arg, char *buffer, size_t buffer_size, off_t file_offset) +{ +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs::view_sequence_region - error 01\n"); + } + + if(cache->padding_arg != 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 02\n"); + } + + if(cache->sequences.size() == 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 03\n"); + } +#endif + + std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if(file.is_open()) { + // parse "chr..:..-.." string + sequence_region sr = sequence_region(seq_region_arg); + + // 02 : check if 'chr' is equals this->data[i].name + for(size_t i = 0; i < this->data.size(); i++) { + if(sr.seq_name.compare(this->data[i]->name) == 0) { + return this->data[i]->view_sequence_region(cache->sequences[i], &sr, buffer, buffer_size, file_offset, &file); + } + } + } + + return 0; +} + + + + /* - * fastafs::view_fasta_chunk_cached - + * fastafs::view_fasta_chunk - * * @cache: * @buffer: @@ -651,12 +887,7 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) * * returns */ -uint32_t fastafs::view_fasta_chunk_cached( - ffs2f_init* cache, - char *buffer, - - size_t buffer_size, - off_t file_offset) +uint32_t fastafs::view_fasta_chunk(ffs2f_init* cache, char *buffer, size_t buffer_size, off_t file_offset) { uint32_t written = 0; std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); @@ -670,7 +901,7 @@ uint32_t fastafs::view_fasta_chunk_cached( const uint32_t sequence_file_size = seq->fasta_filesize(cache->padding_arg); if(pos < sequence_file_size) { - const uint32_t written_seq = seq->view_fasta_chunk_cached( + const uint32_t written_seq = seq->view_fasta_chunk( cache->sequences[i], &buffer[written], std::min((uint32_t) buffer_size - written, sequence_file_size), @@ -692,7 +923,7 @@ uint32_t fastafs::view_fasta_chunk_cached( } file.close(); } else { - throw std::runtime_error("[fastafs::view_fasta_chunk_cached] could not load fastafs: " + this->filename); + throw std::runtime_error("[fastafs::view_fasta_chunk] could not load fastafs: " + this->filename); } return written; } @@ -795,6 +1026,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi header_offset_previous++; } } + ffs2f_init* cache = this->init_ffs2f(0, false); // false, no masking needed, always upper-case is fine in this case for(i = 0; i < this->data.size(); i++) { sequence = this->data[i]; @@ -835,6 +1067,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi return written; } } + uint_to_fourbytes_ucsc2bit(n_seq, sequence->n_ends[k] - sequence->n_starts[k] + 1); pos_limit += 4; while(pos < pos_limit) { @@ -853,6 +1086,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -866,6 +1100,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -877,6 +1112,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -889,6 +1125,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = '\0'; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -899,10 +1136,11 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi uint32_t full_twobits = sequence->n / 4; twobit_byte t; pos_limit += full_twobits; + while(pos < pos_limit) { //printf("%i - %i = %i || %i\n",pos_limit,pos, (full_twobits - (pos_limit - pos)) * 4, j); //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file); - sequence->view_fasta_chunk_cached(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), &file); + sequence->view_fasta_chunk(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), &file); t.set(n_seq); buffer[written++] = t.data; pos++; @@ -922,7 +1160,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi if(pos < pos_limit) { //printf("%i - %i = %i || %i :: %i == %i \n",pos_limit,pos, full_twobits * 4, j, sequence->n - (full_twobits * 4), sequence->n - j); //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + full_twobits * 4, sequence->n - (full_twobits * 4), &file); - sequence->view_fasta_chunk_cached(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, &file); + sequence->view_fasta_chunk(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, &file); t.set(n_seq); buffer[written++] = t.data; pos++; @@ -936,7 +1174,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi delete cache; file.close(); } else { - throw std::runtime_error("[fastafs::view_fasta_chunk_cached] could not load fastafs: " + this->filename); + throw std::runtime_error("[fastafs::view_fasta_chunk] could not load fastafs: " + this->filename); } return written; } @@ -1081,6 +1319,40 @@ size_t fastafs::view_dict_chunk(char *buffer, size_t buffer_size, off_t file_off +//@todo add unit tests +size_t fastafs::fastafs_filesize(void) +{ + // header + n-sequences + size_t n = 4 + 4 + 2 + 4; + + // number sequences + n += 4; + + // per sequence + for(uint32_t i = 0; i < this->data.size(); i++) { + n += 2;// flags + n += 1; // name length + n += this->data[i]->name.size();// name + n += 4; // reference to compr. data + + // compr dataa + n += 4 + 4 + 4;// compressed nuc. + n blocks + m blocks + n += this->data[i]->n_bits(); + n += this->data[i]->n_starts.size() * 8; + n += 16;//md5 sum, always present? + n += this->data[i]->m_starts.size() * 8; + } + + // metadata + n += 1; // @ todo more sophi. + + // crc32 + n += 4; + + return n; +} + + size_t fastafs::fasta_filesize(uint32_t padding) { size_t n = 0; @@ -1089,6 +1361,7 @@ size_t fastafs::fasta_filesize(uint32_t padding) //if(file.is_open()) { // file.close(); + for(size_t i = 0; i < this->data.size(); i++) { n += this->data[i]->fasta_filesize(padding); } @@ -1193,6 +1466,9 @@ uint32_t fastafs::view_faidx_chunk(uint32_t padding, char *buffer, size_t buffer return written; } + + + /* https://www.ebi.ac.uk/ena/cram/sha1/7716832754e642d068e6fbd8f792821ca5544309 Hello message sent @@ -1244,11 +1520,26 @@ int fastafs::info(bool ena_verify_checksum) std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); if(file.is_open()) { - std::cout << "FASTAFS NAME: " << this->filename << "\n"; - printf("SEQUENCES: %u\n", (uint32_t) this->data.size()); + std::cout << "# FASTAFS NAME: " << this->filename << "\n"; + printf("# SEQUENCES: %u\n", (uint32_t) this->data.size()); for(uint32_t i = 0; i < this->data.size(); i++) { md5_digest_to_hash(this->data[i]->md5_digest, md5_hash); + + std::string compression_type; + if(this->data[i]->flags.is_twobit()) { + compression_type = "2bit"; + } else if(this->data[i]->flags.is_fourbit()) { + compression_type = "4bit"; + } else { + compression_type = "????"; + } + + + // print sequence name & size & checksum + printf("%-24s%-12i%s %s", this->data[i]->name.c_str(), this->data[i]->n, compression_type.c_str(), md5_hash); + + if(ena_verify_checksum) { //wget header of: //https://www.ebi.ac.uk/ena/cram/md5/ @@ -1259,7 +1550,6 @@ int fastafs::info(bool ena_verify_checksum) //struct sockadfiledr_in address; int sock = 0; struct sockaddr_in serv_addr; - //std::string hello2 = "GET /ena/cram/md5/" + std::string(md5_hash) + " HTTP/1.1\r\nHost: www.ebi.ac.uk\r\nConnection: Keep-Alive\r\n\r\n"; std::string hello2 = "GET /ena/cram/md5/" + std::string(md5_hash) + " HTTP/1.1\r\nHost: www.ebi.ac.uk\r\nConnection: Keep-Alive\r\n\r\n"; //char *hello = &hello2.c_str(); @@ -1311,15 +1601,15 @@ int fastafs::info(bool ena_verify_checksum) int NNvalread = SSL_read(ssl, buffer, 32); if(NNvalread < 0) { - printf(" >%-24s%-12i%s \n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash); + printf(" "); } else if(std::string(buffer).find(" 200 ") != (size_t) -1) { // sequence is in ENA - printf(" >%-24s%-12i%s https://www.ebi.ac.uk/ena/cram/md5/%s\n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash, md5_hash); + printf(" https://www.ebi.ac.uk/ena/cram/md5/%s", md5_hash); } else { - printf(" >%-24s%-12i%s ---\n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash); + printf(" ---"); } - } else { - printf(" >%-24s%-12i%s\n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash); } + + printf("\n"); } file.close(); } @@ -1328,13 +1618,95 @@ int fastafs::info(bool ena_verify_checksum) } -int fastafs::check_integrity() +// skips first four bytes and does not include crc32 at the end either +uint32_t fastafs::get_crc32(void) { if(this->filename.size() == 0) { throw std::invalid_argument("No filename found"); } - int retcode = 0; + // starts at 4th + size_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; + + uLong crc = crc32(0L, Z_NULL, 0); + + const int buffer_size = 4; + char buffer[buffer_size + 1]; + + size_t bytes_to_be_read_this_iter; + size_t bytes_actually_read_this_iter; + + // now calculate crc32 checksum, as all bits have been set. + std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + while(total_bytes_to_be_read > 0) { + bytes_to_be_read_this_iter = std::min((size_t) buffer_size, total_bytes_to_be_read) ; + fh_fastafs_crc.read(buffer, bytes_to_be_read_this_iter); + total_bytes_to_be_read -= bytes_to_be_read_this_iter; + + bytes_actually_read_this_iter = fh_fastafs_crc.gcount(); + if(bytes_actually_read_this_iter == 0) { + total_bytes_to_be_read = 0; // unexpected eof? + } else { + crc = crc32(crc, (const Bytef*)& buffer, (uint32_t) bytes_actually_read_this_iter); + } + } + + return (uint32_t) crc; +} + + +//true = integer +//false = corrupt +bool fastafs::check_file_integrity(bool verbose) +{ + uint32_t crc32_current = this->get_crc32(); + + char buf_old[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(buf_old, (uint32_t) this->crc32f); + + if(crc32_current != this->crc32f) { + + char buf_new[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(buf_new, (uint32_t) crc32_current); + + if(verbose) { + printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (encoded in fastafs) != %02hhx%02hhx%02hhx%02hhx (on disk)\n--\n", + (unsigned char) buf_old[0], + (unsigned char) buf_old[1], + (unsigned char) buf_old[2], + (unsigned char) buf_old[3], + + (unsigned char) buf_new[0], + (unsigned char) buf_new[1], + (unsigned char) buf_new[2], + (unsigned char) buf_new[3]); + } + } else { + if(verbose) { + printf("OK\t%02hhx%02hhx%02hhx%02hhx\n--\n", + (unsigned char) buf_old[0], + (unsigned char) buf_old[1], + (unsigned char) buf_old[2], + (unsigned char) buf_old[3]); + } + } + + return (crc32_current == this->crc32f); +} + + +//true = integer +//false = corrupt +bool fastafs::check_sequence_integrity(bool verbose) +{ + if(this->filename.size() == 0) { + throw std::invalid_argument("No filename found"); + } + + bool retcode = true; + char md5_hash[32 + 1] = ""; md5_hash[32] = '\0'; std::string old_hash; @@ -1347,23 +1719,22 @@ int fastafs::check_integrity() md5_digest_to_hash(this->data[i]->md5_digest, md5_hash); old_hash = std::string(md5_hash); - /* - * const uint32_t padding;// padding used for this sequence, cannot be 0 - const uint32_t total_sequence_containing_lines;// calculate total number of full nucleotide lines: (this->n + padding - 1) / padding - - std::vector n_starts; - std::vector n_ends; - * */ - std::string new_hash = this->data[i]->md5(cache->sequences[i], &file); if(old_hash.compare(new_hash) == 0) { - printf("OK\t%s\n", this->data[i]->name.c_str()); + if(verbose) { + printf("OK\t%s\n", this->data[i]->name.c_str()); + } } else { - printf("ERROR\t%s\t%s != %s\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); - retcode = EIO; + if(verbose) { + printf("ERROR\t%s\t%s (encoded in fastafs) != %s (on disk)\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); + } + + retcode = false; } } file.close(); + } else { + throw std::runtime_error("[fastafs::check_sequence_integrity] could not load fastafs: " + this->filename); } delete cache; @@ -1384,8 +1755,9 @@ uint32_t fastafs::n() } - +/* std::string fastafs::basename() { return ""; } +*/ diff --git a/src/flags.cpp b/src/flags.cpp new file mode 100644 index 00000000..e1ec3911 --- /dev/null +++ b/src/flags.cpp @@ -0,0 +1,161 @@ + + +#include + +#include "flags.hpp" + + + +twobit_flag::twobit_flag() +{ + // ensure all bits are set, this prevents unexpected or undefined behaviour + this->bits[0] = '\0'; + this->bits[1] = '\0'; +} + + + +void twobit_flag::set(char *data) +{ + this->bits[0] = data[0]; + this->bits[1] = data[1]; +} + + + +// https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ +bool twobit_flag::get_flag(unsigned char bit) +{ +#if DEBUG + if(bit >= 16) { + throw std::runtime_error("twobit_flag::get_flag = out of bound: " + std::to_string(bit) + "\n"); + } +#endif //DEBUG + + return (this->bits[bit / 8] & bitmasks[bit]); +} + + + +// https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ +void twobit_flag::set_flag(unsigned char bit, bool enable) +{ + if(bit >= 16) { + throw std::runtime_error("twobit_flag::set_flag = out of bound: " + std::to_string(bit) + "\n"); + } + + + if(enable) { // + //this->bits[bit / 8] |= bitmasks[bit]; + this->bits[bit / 8] = (unsigned char)(this->bits[bit / 8] | bitmasks[bit]); + } else { + //this->bits[bit / 8] &= ~bitmasks[bit]; + this->bits[bit / 8] = (unsigned char)(this->bits[bit / 8] & ~bitmasks[bit]); + } +} + + +std::array &twobit_flag::get_bits(void) +{ + return this->bits; +} + + + + +bool fastafs_flags::is_complete() +{ + return this->get_flag(FASTAFS_BITFLAG_COMPLETE); +} + +void fastafs_flags::set_complete() +{ + this->set_flag(FASTAFS_BITFLAG_COMPLETE, true); +} + +void fastafs_flags::set_incomplete() +{ + this->set_flag(FASTAFS_BITFLAG_COMPLETE, false); +} + + + + +// alphabet: 'ACTG' + 'N' +bool fastafs_sequence_flags::is_dna() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); +} + +// alphabet: 'ACUG' + 'N' +bool fastafs_sequence_flags::is_rna() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == true && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); +} + +// alphabet: 'ACGTURYKMSWBDHVN' + '-' +bool fastafs_sequence_flags::is_iupec_nucleotide() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == true); +} + +bool fastafs_sequence_flags::is_complete() +{ + return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE); +} + +bool fastafs_sequence_flags::is_circular() +{ + return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR); +} + + + + + + +void fastafs_sequence_flags::set_dna() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,0 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); +} + +void fastafs_sequence_flags::set_rna() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, true); // 1,0 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); +} + +void fastafs_sequence_flags::set_iupec_nucleotide() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,1 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, true); +} + +void fastafs_sequence_flags::set_complete() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, true); +} + +void fastafs_sequence_flags::set_incomplete() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, false); +} + +void fastafs_sequence_flags::set_linear() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, false); +} + +void fastafs_sequence_flags::set_circular() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, true); +} + + diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp new file mode 100644 index 00000000..399d790d --- /dev/null +++ b/src/fourbit_byte.cpp @@ -0,0 +1,248 @@ +#include +#include + +#include "config.hpp" + +#include "fourbit_byte.hpp" + + +/* + alphabet = ACGTURYKMSWBDHVN + +binary: IUPEC + +00000000 AA +00000001 AC +00000010 AG +... +00000010 NH +11111110 NV +11111111 NN + */ + +const char fourbit_byte::fourbit_alhpabet[17] = "ACGTURYKMSWBDHVN"; +const char fourbit_byte::encode_hash[256][3] = {"AA", "AC", "AG", "AT", "AU", "AR", "AY", "AK", "AM", "AS", "AW", "AB", "AD", "AH", "AV", "AN", "CA", "CC", "CG", "CT", "CU", "CR", "CY", "CK", "CM", "CS", "CW", "CB", "CD", "CH", "CV", "CN", "GA", "GC", "GG", "GT", "GU", "GR", "GY", "GK", "GM", "GS", "GW", "GB", "GD", "GH", "GV", "GN", "TA", "TC", "TG", "TT", "TU", "TR", "TY", "TK", "TM", "TS", "TW", "TB", "TD", "TH", "TV", "TN", "UA", "UC", "UG", "UT", "UU", "UR", "UY", "UK", "UM", "US", "UW", "UB", "UD", "UH", "UV", "UN", "RA", "RC", "RG", "RT", "RU", "RR", "RY", "RK", "RM", "RS", "RW", "RB", "RD", "RH", "RV", "RN", "YA", "YC", "YG", "YT", "YU", "YR", "YY", "YK", "YM", "YS", "YW", "YB", "YD", "YH", "YV", "YN", "KA", "KC", "KG", "KT", "KU", "KR", "KY", "KK", "KM", "KS", "KW", "KB", "KD", "KH", "KV", "KN", "MA", "MC", "MG", "MT", "MU", "MR", "MY", "MK", "MM", "MS", "MW", "MB", "MD", "MH", "MV", "MN", "SA", "SC", "SG", "ST", "SU", "SR", "SY", "SK", "SM", "SS", "SW", "SB", "SD", "SH", "SV", "SN", "WA", "WC", "WG", "WT", "WU", "WR", "WY", "WK", "WM", "WS", "WW", "WB", "WD", "WH", "WV", "WN", "BA", "BC", "BG", "BT", "BU", "BR", "BY", "BK", "BM", "BS", "BW", "BB", "BD", "BH", "BV", "BN", "DA", "DC", "DG", "DT", "DU", "DR", "DY", "DK", "DM", "DS", "DW", "DB", "DD", "DH", "DV", "DN", "HA", "HC", "HG", "HT", "HU", "HR", "HY", "HK", "HM", "HS", "HW", "HB", "HD", "HH", "HV", "HN", "VA", "VC", "VG", "VT", "VU", "VR", "VY", "VK", "VM", "VS", "VW", "VB", "VD", "VH", "VV", "VN", "NA", "NC", "NG", "NT", "NU", "NR", "NY", "NK", "NM", "NS", "NW", "NB", "ND", "NH", "NV", "NN"}; + + +/* + input \t output + 0 6 + 1 4 + 2 2 + 3 0 + + 4 6 + 5 4 + 6 2 + 7 0 + +not sure what the quickest way is - this way all calculations are done as ints, not as chars + + + */ +unsigned char fourbit_byte::iterator_to_offset(uint32_t iterator) +{ + if(iterator % 2 == 0) { + return 4; + } else { + return 0; + } +} + +// @todo, offset needs to be second parameter +void fourbit_byte::set(unsigned char bit_offset, unsigned char nucleotide) +{ + // bit_offset must be: {0, or 4}; -> location in bits + // nucleotides must be: + // => T - 00, C - 01, A - 10, G - 11 + // => T - 00, C - 1, A - 2, G - 3 +#if DEBUG + if(bit_offset != 0 and bit_offset != 4) { + throw std::invalid_argument("fourbit_byte(bit_offset, ..) must be 0 or 4\n"); + } +#endif //DEBUG + //set bit(s): INPUT |= 1 << N; + //unset bit(s): INPUT &= ~(1 << N); + + switch(nucleotide) { + case 0:// A (0000) + this->data = (unsigned char)(this->data & ~((8 + 4 + 2 + 1) << bit_offset)); // set zero's + break; + case 1:// C (0001) + this->data = (unsigned char)(this->data & ~((8 + 4 + 2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((1) << bit_offset)); // set one's + break; + case 2:// G (0010) + this->data = (unsigned char)(this->data & ~((8 + 4 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((2) << bit_offset)); // set one's + break; + case 3:// T (0011) + this->data = (unsigned char)(this->data & ~((8 + 4) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((2 + 1) << bit_offset)); // set one's + break; + case 4:// U (0100) + this->data = (unsigned char)(this->data & ~((8 + 2 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4) << bit_offset)); // set one's + break; + case 5:// R (0101) + this->data = (unsigned char)(this->data & ~((8 + 2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4 + 1) << bit_offset)); // set one's + break; + case 6:// Y (0110) + this->data = (unsigned char)(this->data & ~((8 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4 + 2) << bit_offset)); // set one's + break; + case 7:// K (0111) + this->data = (unsigned char)(this->data & ~((8) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4 + 2 + 1) << bit_offset)); // set one's + break; + case 8:// M (1000) + this->data = (unsigned char)(this->data & ~((4 + 2 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8) << bit_offset)); // set one's + break; + case 9:// S (1001) + this->data = (unsigned char)(this->data & ~((4 + 2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 1) << bit_offset)); // set one's + break; + case 10:// W (1010) + this->data = (unsigned char)(this->data & ~((4 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 2) << bit_offset)); // set one's + break; + case 11:// B (1011) + this->data = (unsigned char)(this->data & ~((4) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 2 + 1) << bit_offset)); // set one's + break; + case 12:// D (1100) + this->data = (unsigned char)(this->data & ~((2 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 4) << bit_offset)); // set one's + break; + case 13:// H (1101) + this->data = (unsigned char)(this->data & ~((2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 4 + 1) << bit_offset)); // set one's + break; + case 14:// V (1110) + this->data = (unsigned char)(this->data & ~((+1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 4 + 2) << bit_offset)); // set one's + break; + case 15:// N (1111) + this->data = (unsigned char)(this->data | ((8 + 4 + 2 + 1) << bit_offset)); // set one's + break; + +#if DEBUG + default: + throw std::invalid_argument("fourbit_byte::set(pos, nucleotide) invalid value\n"); + break; +#endif //DEBUG + } +}; + + +// input char "AACCCTTGG" +// N's are treated as 0, for some weird reason +void fourbit_byte::set(char* buffer) +{ + const std::array< unsigned char, 4> bit_offsets = {4, 0}; + for(unsigned char i = 0; i < 2; i++) { + switch(buffer[i]) { + + case 'A':// A (0000) + case 'a': + this->set(bit_offsets[i], 0); + break; + case 'C':// C (0001) + case 'c': + this->set(bit_offsets[i], 1); + break; + case 'G':// G (0010) + case 'g': + this->set(bit_offsets[i], 2); + break; + case 'T':// T (0011) + case 't': + this->set(bit_offsets[i], 3); + break; + case 'U':// U (0100) + case 'u': + this->set(bit_offsets[i], 4); + break; + case 'R':// R (0101) + case 'r': + this->set(bit_offsets[i], 5); + break; + case 'Y':// Y (0110) + case 'y': + this->set(bit_offsets[i], 6); + break; + case 'K':// K (0111) + case 'k': + this->set(bit_offsets[i], 7); + break; + case 'M':// M (1000) + case 'm': + this->set(bit_offsets[i], 8); + break; + case 'S':// S (1001) + case 's': + this->set(bit_offsets[i], 9); + break; + case 'W':// W (1010) + case 'w': + this->set(bit_offsets[i], 10); + break; + case 'B':// B (1011) + case 'b': + this->set(bit_offsets[i], 11); + break; + case 'D':// D (1100) + case 'd': + this->set(bit_offsets[i], 12); + break; + case 'H':// H (1101) + case 'h': + this->set(bit_offsets[i], 13); + break; + case 'V':// V (1110) + case 'v': + this->set(bit_offsets[i], 14); + break; + case 'N':// N (1111) + case 'n': + this->set(bit_offsets[i], 15); + break; + +#if DEBUG + default: + throw std::invalid_argument("fourbit_byte::set(char *) invalid value\n"); + break; +#endif //DEBUG + } + } +} + +/** + * @brief fully decodes a fourbit byte, not referencing to a hash but allocating a new char*, + * slower than fourbit_byte::get(void) but capable of determining very ends +**/ +char *fourbit_byte::get(unsigned char length) +{ +#if DEBUG + if(length > 2) { + throw std::invalid_argument("four_byte::get(unsigned char length) -> out of bound: " + std::to_string(length) + "\n"); + } +#endif //DEBUG + + char *seq = new char[length + 1]; + for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 + seq[i] = fourbit_byte::encode_hash[this->data][i]; + } + seq[length] = '\0'; + + return seq; +} + + + +const char *fourbit_byte::get() +{ + return fourbit_byte::encode_hash[this->data]; +} diff --git a/src/fuse.cpp b/src/fuse.cpp index d71ceed6..ea71b29c 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -14,13 +14,14 @@ #include #include #include -#include +//#include #include "fuse.hpp" #include "database.hpp" #include "fastafs.hpp" #include "ucsc2bit.hpp" +#include "sequence_region.hpp" // http://www.maastaar.net/fuse/linux/filesystem/c/2016/05/21/writing-a-simple-filesystem-using-fuse/ @@ -32,6 +33,8 @@ struct fuse_instance { //fastasfs fastafs *f; ffs2f_init *cache; + ffs2f_init *cache_p0;// cache with padding of 0; used by API '/seq/chr1:123:456' + bool from_fastafs; // if false, from 2bit // ucsc2bit @@ -39,6 +42,7 @@ struct fuse_instance { // generic uint32_t padding; + bool allow_masking; int argc_fuse; }; @@ -66,6 +70,7 @@ static int do_getattr(const char *path, struct stat *st) st->st_atime = time(NULL); // The last "a"ccess of the file/directory is right now st->st_mtime = time(NULL); // The last "m"odification of the file/directory is right now + printf("[%s]\n", path); if(strcmp(path, "/") == 0) { //st->st_mode = S_IFREG | 0644; //st->st_nlink = 1; @@ -73,6 +78,20 @@ static int do_getattr(const char *path, struct stat *st) //directory st->st_mode = S_IFDIR | 0755; st->st_nlink = 2; // Why "two" hardlinks instead of "one"? The answer is here: http://unix.stackexchange.com/a/101536 + } else if(strlen(path) == 4 && strncmp(path, "/seq", 4) == 0) { + //directory + //printf("setting to DIR because /seq\n"); + st->st_mode = S_IFDIR | 0755; + st->st_nlink = 1; + } else if(strlen(path) > 4 && strncmp(path, "/seq/", 5) == 0) { + // API: "/seq/chr1:123-456" + printf("setting to FILE [%s] because /seq/...\n", path); + // @ todo - run a check on wether the chr exists and return err otherwise + st->st_mode = S_IFREG | 0644; + st->st_nlink = 1; + + //@todo this needs to be defined with some api stuff:!! + st->st_size = (signed int) ffi->f->view_sequence_region_size(ffi->cache_p0, (strchr(path, '/') + 5)); } else { st->st_mode = S_IFREG | 0644; st->st_nlink = 1; @@ -95,6 +114,9 @@ static int do_getattr(const char *path, struct stat *st) } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { st->st_size = ffi->f->dict_filesize(); } + //else if(strncmp(path, "/seq/", 5) == 0) { // api access + //printf("filesize: set to 4096\n"); + //} } } else { if(ffi->u2b != nullptr) { @@ -156,6 +178,10 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of } } + if(strcmp(path, "/") == 0) { // If the user is trying to show the files/directories of the root directory show the following + filler(buffer, "seq", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 + } + return 0; } @@ -168,7 +194,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st time_t now = time(0); strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); - static int written = -1; + static int written = -2;// -1 = permission deinied, -2 = missing file or directory if(ffi->from_fastafs) { printf("\033[0;32m[%s]\033[0;33m fastafs::do_read(\033[0msize=%u, offset=%u\033[0;33m):\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) size, (uint32_t) offset, path, ffi->f->name.c_str(), ffi->padding); @@ -178,14 +204,17 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st std::string virtual_ucsc2bit_filename = "/" + ffi->f->name + ".2bit"; std::string virtual_dict_filename = "/" + ffi->f->name + ".dict"; + printf("?? [[%s]]\n", path); if(strcmp(path, virtual_fasta_filename.c_str()) == 0) { - written = (signed int) ffi->f->view_fasta_chunk_cached(ffi->cache, buffer, size, offset); + written = (signed int) ffi->f->view_fasta_chunk(ffi->cache, buffer, size, offset); } else if(strcmp(path, virtual_faidx_filename.c_str()) == 0) { written = (signed int) ffi->f->view_faidx_chunk(ffi->padding, buffer, size, offset); } else if(strcmp(path, virtual_ucsc2bit_filename.c_str()) == 0) { written = (signed int) ffi->f->view_ucsc2bit_chunk(buffer, size, offset); } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); + } else if(strncmp(path, "/seq/", 5) == 0) { // api access + written = (signed int) ffi->f->view_sequence_region(ffi->cache_p0, (strchr(path, '/') + 5), buffer, size, offset); } } else { if(ffi->u2b != nullptr) { @@ -337,6 +366,7 @@ void print_fuse_help() std::cout << "\n"; std::cout << "general options:\n"; std::cout << " -2 --2bit virtualise a 2bit file rather than FASTAFS UID\n"; + std::cout << " -m --no-masking Disable masking; bases in lower-case (not for 2bit output)\n"; std::cout << " -p ,--padding padding / FASTA line length\n"; std::cout << " -o opt,[opt...] mount options\n"; std::cout << " -h --help print help\n"; @@ -411,17 +441,32 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) //fastafs_fuse_instance *ffi = new fastafs_fuse_instance({nullptr, 60, 1, new char[argc]}); //fastafs_fuse_instance *ffi = new fastafs_fuse_instance({nullptr, 60, 0, nullptr}); - fuse_instance *fi = new fuse_instance({nullptr, nullptr, true, nullptr, 60, 0}); + + + fuse_instance *fi = new fuse_instance({ + nullptr, // pointer to fastafs decoder - if from_fasta is set to true + nullptr,// pointer to fastafs_init with defined padding + nullptr, // pointer to fastafs_init with cache size of 0 (for mounting ./seq/chr1:123-456 + true, // from fastafs + nullptr, // pointer to ucsc2bit decoder - if from_fasta is set to false + 60, // default_padding + true, // allow_masking + 0 // argc_fuse + }); //fuse option variable to send to fuse argv_fuse[fi->argc_fuse++] = (char *) "fastafs"; // becomes fuse.fastafs + printf("checkpoint a\n"); + std::vector fuse_options = {}; // those that need to be appended later char current_argument = '\0';// could be o for '-o', etc. + + std::vector full_args = {}; - for(unsigned int i = 0; i < argc; ++i) { + for(signed int i = 0; i < argc; ++i) { printf("processing argv[%i] = '%s' [current argument=%i]\n", i, argv[i], (int) current_argument); if(current_argument != '\0') { // parse the arguments' value @@ -447,6 +492,9 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) case '2': // a fastafs specific flag fi->from_fastafs = false; break; + case 'm': // disable masking + fi->allow_masking = false; + break; case 'f': // fuse specific flags case 's': @@ -460,7 +508,13 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) break; default: // argument, fastafs spcific (such as '-p' followed by '50') - current_argument = argv[i][1]; + if(strcmp(argv[i], "--2bit") == 0) { + fi->from_fastafs = false;; + } else if(strcmp(argv[i], "--no-masking") == 0) { + fi->allow_masking = false; + } else { + current_argument = argv[i][1]; + } break; } } else { @@ -468,8 +522,14 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } - if(full_args.size() >= 2) { + printf("checkpoint b\n"); + + + if(full_args.size() > 2) { + printf("checkpoint c\n"); + printf("full_args.size() = %u\n", (uint32_t) full_args.size()); int mount_target_arg = full_args[full_args.size() - 2 ]; // last two arguments are and , location to last 2 args not starting with --/- are in this vector + printf("out of bound???\n"); if(fi->from_fastafs) { database d = database(); @@ -491,9 +551,10 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) fi->f = new fastafs(name); fi->f->load(fname); - fi->cache = fi->f->init_ffs2f(fi->padding, true);// allow mixed case + fi->cache = fi->f->init_ffs2f(fi->padding, fi->allow_masking); + fi->cache_p0 = fi->f->init_ffs2f(0, true);// allow mixed case } else { - std::string basename = basename_cpp(std::string(argv[mount_target_arg])); + std::string basename = basename_cpp(std::string(argv[mount_target_arg])); //std::string basename = std::filesystem::path(std::string(argv[mount_target_arg])).filename(); fi->u2b = new ucsc2bit(basename);// useses basename as prefix for filenames to mount: hg19.2bit -> hg19.2bit.fa @@ -507,6 +568,8 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } + printf("checkpoint c\n"); + return fi; } @@ -515,11 +578,15 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) void fuse(int argc, char *argv[]) { + printf("wake up\n"); + // part 1 - rewrite args because "fastafs" "mount" is considered as two args, crashing fuse_init // - @todo at some point define that second mount is not really important? if possible char *argv2[argc]; fuse_instance *ffi = parse_args(argc, argv, argv2); + printf("checkpoint\n"); + // part 2 - print what the planning is char cur_time[100]; time_t now = time(0); diff --git a/src/lsfastafs.cpp b/src/lsfastafs.cpp index 9fd654a4..aae91711 100644 --- a/src/lsfastafs.cpp +++ b/src/lsfastafs.cpp @@ -1,4 +1,4 @@ -#include +//#include #include #include #include @@ -47,7 +47,6 @@ std::unordered_multimap > get_f fprintf(stdout, "Could not open /proc/mounts - are you sure this is running on linux?\n"); } do { - match = fscanf(f, "%255s %255s %255s %255s %d %d\n", mount_dev, mount_dir, mount_type, mount_opts, &mount_freq, &mount_passno); mount_dev[255] = 0; mount_dir[255] = 0; @@ -59,13 +58,12 @@ std::unordered_multimap > get_f strstr(mount_dir, arg) != NULL)) { std::string fn = std::string(mount_dir); - + std::string basename = basename_cpp(fn); //std::string basename = std::filesystem::path(fn).filename(); //std::cout << "basename: " << basename << "\n"; - - std::string dict_fn = std::string(mount_dir) + "/" + basename + ".dict"; + std::string dict_fn = std::string(mount_dir) + "/" + basename + ".dict"; if(getxattr(mount_dir, FASTAFS_FILE_XATTR_NAME.c_str(), xattr_fastafs_file, 255) != -1 && getxattr(mount_dir, FASTAFS_PID_XATTR_NAME.c_str(), xattr_fastafs_pid, 255) != -1 @@ -77,6 +75,7 @@ std::unordered_multimap > get_f } } + // else: line did not contain fastafs mount point } while(match != EOF); fclose(f); diff --git a/src/main.cpp b/src/main.cpp index b58bb48a..5033a5f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,7 +6,8 @@ #include #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" +#include "fasta_to_fourbit_fastafs.hpp" #include "ucsc2bit_to_fastafs.hpp" #include "database.hpp" #include "fuse.hpp" @@ -39,6 +40,7 @@ void usage_view(void) std::cout << "View FASTAFS file in FASTA format" << std::endl << std::endl; std::cout << " -f, --file Provide fastafs by file path, not from database (cache)" << std::endl; std::cout << " -p, --padding Number of nucleotides before delimited with a newline [default=60]" << std::endl; + std::cout << " -m, --no-masking Disable masking; bases in lower-case (not for 2bit output)" << std::endl; std::cout << " -2, --2bit View in UCSC twoBit/2bit format" << std::endl; std::cout << " http://genome.ucsc.edu/FAQ/FAQformat.html#format7" << std::endl; std::cout << std::endl; @@ -67,8 +69,9 @@ void usage_check(void) void usage_cache(void) { - std::cout << "usage: " << PACKAGE << " cache \n\n"; - std::cout << "\n"; + std::cout << "usage: " << PACKAGE << " cache \n"; + std::cout << " cache -o \n\n"; + std::cout << " -o, --output-file Explicitly define fastafs output file and do not write to database (cache)\n"; } int main(int argc, char *argv[]) @@ -83,7 +86,7 @@ int main(int argc, char *argv[]) std::cout << PACKAGE << " v" << PACKAGE_VERSION << GIT_SHA1_STRING << "-release\n\n"; #endif //DEBUG - std::cout << "Copyright (C) 2017 Youri Hoogstrate." << "\n"; + std::cout << "Copyright (C) 2017 Dr. Youri Hoogstrate." << "\n"; std::cout << "License GPLv2+: GNU GPL version 2 or later .\n"; std::cout << "This is free software: you are free to change and redistribute it.\n"; std::cout << "There is NO WARRANTY, to the extent permitted by law.\n\n"; @@ -92,13 +95,37 @@ int main(int argc, char *argv[]) exit(0); } else if(strcmp(argv[1], "cache") == 0) { if(argc > 3) { - database d = database(); - std::string fname_out = d.add(argv[argc - 2]); + bool to_cache = true; + if(argc > 4 && strlen(argv[argc - 3]) >= 2) { + if( + (strcmp(argv[argc - 3], "-o") == 0) + or + (strcmp(argv[argc - 3], "--output-file") == 0) + + ) { + to_cache = false; + } + } - if(is_fasta_file(argv[argc - 1])) { - fasta_to_fastafs(argv[argc - 1], fname_out); + std::string fname_out; + if(to_cache) { + database d = database(); + fname_out = d.add(argv[argc - 2]); } else { + fname_out = std::string(argv[argc - 2]); + } + + if(is_fasta_file(argv[argc - 1])) { + try { + fasta_to_twobit_fastafs(argv[argc - 1], fname_out); + } catch(std::runtime_error& e) { + fasta_to_fourbit_fastafs(argv[argc - 1], fname_out); + } + } else if(is_ucsc2bit_file(argv[argc - 1])) { ucsc2bit_to_fastafs(argv[argc - 1], fname_out); + } else { + throw std::runtime_error("[main::cache] Invalid file format"); + return 1; } } else { usage_cache(); @@ -117,6 +144,7 @@ int main(int argc, char *argv[]) bool from_file = false; bool skip_argument = false; + bool allow_masking = true;// allow upper and lower case for(int i = 2; i < argc - 1; i++) { if(skip_argument) { @@ -124,6 +152,8 @@ int main(int argc, char *argv[]) } else { if(strcmp(argv[i], "-f") == 0 or strcmp(argv[i], "--file") == 0) { from_file = true; + } else if(strcmp(argv[i], "-m") == 0 or strcmp(argv[i], "--no-masking") == 0) { + allow_masking = false; } else if((strcmp(argv[i], "-p") == 0 or strcmp(argv[i], "--padding") == 0) and i + 1 < argc - 1) { try { sscanf(argv[++i], "%u", &padding); @@ -166,7 +196,7 @@ int main(int argc, char *argv[]) written = f.view_ucsc2bit_chunk(buffer, READ_BUFFER_SIZE, offset); } } else { - ffs2f_init* cache = f.init_ffs2f(padding, true); + ffs2f_init* cache = f.init_ffs2f(padding, allow_masking); f.view_fasta(cache);//@todo make argument parsing delete cache; @@ -253,7 +283,13 @@ int main(int argc, char *argv[]) fastafs f = fastafs(std::string(argv[argc - 1])); f.load(fname); - return f.check_integrity(); + bool check1 = f.check_file_integrity(true); + bool check2 = f.check_sequence_integrity(true); + if(check1 and check2) { + return 0; + } else { + return EIO; + } } else { usage_check(); } diff --git a/src/sequence_region.cpp b/src/sequence_region.cpp new file mode 100644 index 00000000..4f7a2015 --- /dev/null +++ b/src/sequence_region.cpp @@ -0,0 +1,98 @@ + +#include "sequence_region.hpp" + + + +sequence_region::sequence_region(char * seqstr) : + seq_name(""), has_defined_end(false), start(0), end(0) +{ + + parse((const char *) seqstr);// char* can be converted to cost char*, but not vice versa + +} + +sequence_region::sequence_region(const char * seqstr) : + seq_name(""), has_defined_end(false), start(0), end(0) +{ + + parse(seqstr); + +} + + +void sequence_region::parse(const char * seqstr) +{ + // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" + size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); + size_t p = 0; + bool proceed = true; + for(size_t i = 0; i < string_max_pos && proceed == true; i++) { + if(seqstr[i] == ':') { + p = i; + proceed = false; + } + } + + if(p > 0) { + this->seq_name = std::string(seqstr, 0, p); + } else if(proceed == true) { + // either with string > 255 chars or string smaller than 255 without ':' + this->seq_name = std::string(seqstr, 0, string_max_pos); + } + + // chr1:1 + // p = 4 + // strlen = 6 + if(proceed == false and strlen(seqstr) > (p + 1)) { + // we can parse numbers + // find position of '-' character + + size_t p2 = 0; + bool proceed2 = true; + + for(size_t i = p; i < strlen(seqstr) && proceed2 == true; i++) { + if(seqstr[i] == '-') { + p2 = (size_t) i; + proceed2 = false; + } + } + + + if(proceed2 == true) { // chrA:123 + std::string start = std::string(seqstr, p + 1, p2 - p - 1); + + this->start = std::stoi(start); + + this->has_defined_end = true; + this->end = this->start; + } else if(p2 == (p + 1)) {// chrA:-123 + std::string end = std::string(seqstr, p2 + 1, strlen(seqstr) - p2 - 1); + + this->start = 0; + this->end = std::stoi(end) ; + + this->has_defined_end = true; + } else if(p2 > (p + 1)) { // chrA:123- | chrA:123-456 | chrA:123-456ERR + if(p2 + 1 == strlen(seqstr)) { // chrA:123- + std::string start = std::string(seqstr, p + 1, p2 - p - 1); + + this->start = std::stoi(start); + this->has_defined_end = false; + } else { // chrA:123-456 | chrA:123-456ERR + std::string start = std::string(seqstr, p + 1, p2 - p - 1); + std::string end = std::string(seqstr, p2 + 1, strlen(seqstr) - p2 - 1); + + + this->start = std::stoi(start) ; + + this->has_defined_end = true; + this->end = std::stoi(end) ; + } + } + + } + + if(this->has_defined_end and this->start > this->end) { + throw std::invalid_argument("Invalid region - start larger than end."); + } +} diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index b1cfb812..38f389a7 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -5,7 +5,7 @@ #include "twobit_byte.hpp" -const char twobit_byte::twobit_hash[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"}; +const char twobit_byte::encode_hash[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"}; /* @@ -48,18 +48,18 @@ void twobit_byte::set(unsigned char bit_offset, unsigned char nucleotide) // ??????00 // 11?? ~(3 << bit_offset) // data ???????? - this->data = (unsigned char)(this->data & ~(3 << bit_offset)); + this->data = (unsigned char)(this->data & ~((2 + 1) << bit_offset)); break; case 1://NUCLEOTIDE_C (01) - this->data = (unsigned char)(this->data & ~(2 << bit_offset)); - this->data = (unsigned char)(this->data | (1 << bit_offset)); + this->data = (unsigned char)(this->data & ~((2) << bit_offset)); + this->data = (unsigned char)(this->data | ((1) << bit_offset)); break; case 2://NUCLEOTIDE_A (10) - this->data = (unsigned char)(this->data & ~(1 << bit_offset)); - this->data = (unsigned char)(this->data | (2 << bit_offset)); + this->data = (unsigned char)(this->data & ~((1) << bit_offset)); + this->data = (unsigned char)(this->data | ((2) << bit_offset)); break; case 3://NUCLEOTIDE_G (11) - this->data = (unsigned char)(this->data | (nucleotide << bit_offset)); + this->data = (unsigned char)(this->data | ((2 + 1) << bit_offset)); break; #if DEBUG default: @@ -72,6 +72,7 @@ void twobit_byte::set(unsigned char bit_offset, unsigned char nucleotide) // input char "AACCCTTGG" // N's are treated as 0, for some weird reason +// this function seems specific for UCSC 2 bit format?! - if so, denote it like that void twobit_byte::set(char* buffer) { const std::array< unsigned char, 4> bit_offsets = {6, 4, 2, 0}; @@ -110,11 +111,19 @@ void twobit_byte::set(char* buffer) **/ char *twobit_byte::get(unsigned char length) { +#if DEBUG + if(length > 4) { + throw std::invalid_argument("twobit_byte::get(unsigned char length) -> out of bound: " + std::to_string(length) + "\n"); + } +#endif //DEBUG + char *seq = new char[length + 1]; + for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 - seq[i] = twobit_byte::twobit_hash[this->data][i]; + seq[i] = twobit_byte::encode_hash[this->data][i]; } seq[length] = '\0'; + return seq; } @@ -122,5 +131,5 @@ char *twobit_byte::get(unsigned char length) const char *twobit_byte::get() { - return twobit_byte::twobit_hash[this->data]; + return twobit_byte::encode_hash[this->data]; } diff --git a/src/ucsc2bit.cpp b/src/ucsc2bit.cpp index 43f06ae2..e2d158c0 100644 --- a/src/ucsc2bit.cpp +++ b/src/ucsc2bit.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +//#include #include "config.hpp" @@ -96,7 +96,7 @@ uint32_t ucsc2bit_seq::view_fasta_chunk(uint32_t padding, char *buffer, size_t b fh->seekg((uint32_t) this->sequence_data_position + ((nucleotide_pos) / 4), std::ios::beg);// std::ios::beg | fh->beg twobit_byte t = twobit_byte(); - const char *chunk = twobit_byte::twobit_hash[0]; + const char *chunk = twobit_byte::encode_hash[0]; unsigned char twobit_offset = nucleotide_pos % 4; @@ -374,7 +374,7 @@ void ucsc2bit::load(std::string afilename) /* -* ucsc2bit::view_fasta_chunk_cached - +* ucsc2bit::view_fasta_chunk - * * @padding: size of padding - placement of newlines (default = 60) * @buffer: @@ -422,7 +422,7 @@ uint32_t ucsc2bit::view_fasta_chunk(uint32_t padding, char *buffer, size_t buffe file.close(); } else { - throw std::runtime_error("[ucsc2bit::view_fasta_chunk_cached] could not load ucsc2bit: " + this->filename); + throw std::runtime_error("[ucsc2bit::view_fasta_chunk] could not load ucsc2bit: " + this->filename); } return written; diff --git a/src/ucsc2bit_to_fastafs.cpp b/src/ucsc2bit_to_fastafs.cpp index de1f0e5b..87cb267c 100644 --- a/src/ucsc2bit_to_fastafs.cpp +++ b/src/ucsc2bit_to_fastafs.cpp @@ -4,6 +4,7 @@ #include "config.hpp" #include "ucsc2bit_to_fastafs.hpp" +#include "flags.hpp" #include "utils.hpp" @@ -29,6 +30,9 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) fastafs fs_new = fastafs(""); uint32_t i, j, n; + fastafs_flags ffsf; + ffsf.set_incomplete(); + ucsc2bit_seq_header *s; ucsc2bit_seq_header_conversion_data *t; @@ -39,7 +43,11 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) // Write header fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; - fh_fastafs << "\x00\x00"s;// the flag for now, set to INCOMPLETE as writing is in progress + + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // Read UCSC2bit header (n seq) @@ -111,7 +119,7 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) // parse and convert sequence fh_ucsc2bit.read(buffer, 4); twobit_byte t_in = twobit_byte(); - const char *decoded_in = t_in.twobit_hash[0];// unnecessary initialization but otherwise gcc whines + const char *decoded_in = t_in.encode_hash[0];// unnecessary initialization but otherwise gcc whines twobit_byte t_out = twobit_byte(); uint32_t k = 0; // iterator in fastafs format @@ -222,8 +230,13 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) s = data[i]; t = data2[i]; - // flag - fh_fastafs << "\x00\x08"s; + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + fsf.set_dna(); + fsf.set_complete(); + fh_fastafs << fsf.get_bits()[0]; + fh_fastafs << fsf.get_bits()[1]; // name fh_fastafs.write((char *) &s->name_size, (size_t) 1); // name size @@ -242,16 +255,32 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) // update header: set to updated fh_fastafs.seekp(8, std::ios::beg); - fh_fastafs << "\x00\x01"s; // updated flag + ffsf.set_complete(); + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); } + fh_ucsc2bit.close(); + fh_fastafs.seekp(0, std::ios::end); - size_t written = fh_fastafs.tellp(); + + fastafs f(""); + f.load(fastafs_file); + uint32_t crc32c = f.get_crc32(); + + char byte_enc[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(byte_enc, (uint32_t) crc32c); + //printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + + + size_t written = fh_fastafs.tellp(); fh_fastafs.close(); - fh_ucsc2bit.close(); return written; } diff --git a/src/utils.cpp b/src/utils.cpp index 446ad08c..f961a882 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -63,6 +63,20 @@ void uint_to_twobytes(char *chars, uint16_t n) +size_t remove_chars(char *s, int c, size_t l) +{ + size_t j = 0; + size_t n = l; + + for(size_t i = j = 0; i < n; i++) + if(s[i] != c) { + s[j++] = s[i]; + } + + s[j] = '\0'; + + return j; +} void uint_to_fourbytes(char *chars, uint32_t n) @@ -156,7 +170,7 @@ bool is_fasta_file(char *filename) FILE *fp; if((fp = fopen(filename, "rb")) == NULL) { - fclose(fp); + //fclose(fp); segfault if NULL throw std::runtime_error("Could not read first byte of putative FASTA file."); return false; } @@ -166,6 +180,37 @@ bool is_fasta_file(char *filename) return (buf[0] == '>');// return true if first byte equals > } else { fclose(fp); + + throw std::runtime_error("Could not read sufficient data."); + } + + return false; +} + + + +bool is_ucsc2bit_file(char *filename) +{ + char buf[4 + 1]; + FILE *fp; + + if((fp = fopen(filename, "rb")) == NULL) { + //fclose(fp); segfault if NULL + throw std::runtime_error("Could not read first byte of putative FASTA file."); + return false; + } + + if(fread(buf, 1, 4, fp) == 4) { + fclose(fp); + return ( + buf[0] == UCSC2BIT_MAGIC[0] and + buf[1] == UCSC2BIT_MAGIC[1] and + buf[2] == UCSC2BIT_MAGIC[2] and + buf[3] == UCSC2BIT_MAGIC[3] + );// return true if first byte equals > + } else { + fclose(fp); + throw std::runtime_error("Could not read sufficient data."); } @@ -175,29 +220,31 @@ bool is_fasta_file(char *filename) // https://www.systutorials.com/241216/how-to-get-the-directory-path-and-file-name-from-a-absolute-path-in-c-on-linux/ // https://stackoverflow.com/questions/38456127/what-is-the-value-of-cplusplus-for-c17 - THEN use std::filesystem::path(filename).filename(); -std::string basename_cpp(std::string fn) { - char* ts = strdup(fn.c_str()); - - //char* dir = dirname(ts1); - char* filename = basename(ts); - //std::string filenamepp = std::string(filename); +std::string basename_cpp(std::string fn) +{ + char* ts = strdup(fn.c_str()); + + //char* dir = dirname(ts1); + char* filename = basename(ts); + //std::string filenamepp = std::string(filename); - //printf("basename: [%s]\n", filename); - //std::cout << "basenamepp: |" << filenamepp << "|\n"; - - return std::string(filename); + //printf("basename: [%s]\n", filename); + //std::cout << "basenamepp: |" << filenamepp << "|\n"; + + std::string filename_cpp = std::string(filename); + //delete[] ts; + //delete[] filename; // deleting these affects the std::string somehow + + return filename_cpp; } // https://www.linuxquestions.org/questions/programming-9/how-to-get-the-full-path-of-a-file-in-c-841046/ // https://stackoverflow.com/questions/38456127/what-is-the-value-of-cplusplus-for-c17 - THEN use std::filesystem::canonical(filename) -std::string realpath_cpp(std::string fn) { - //std::string out = "asd"; - char *path = realpath(fn.c_str(), NULL); - //printf("realpath: [%s]\n", path); - - //std::string realpathpp = std::string(path); - //std::cout << "realpath: |" << realpathpp << "|\n"; - - return std::string(path); +std::string realpath_cpp(std::string fn) +{ + char buf[1024]; + realpath(fn.c_str(), buf); + + return std::string(buf); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a647f085..d3604c30 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,40 +12,56 @@ else() endif() -add_definitions(-std=c++17) +add_definitions(-std=c++14) set(BUILD_DIR "../bin") set(BUILD_TEST_DIR "${BUILD_DIR}/test") -add_executable(test_cache cache/test_cache.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_view view/test_view.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -#add_executable(test_tree tree/test_tree.cpp) -add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) +add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_check check/test_check.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_sequenceregion sequenceregion/test_sequenceregion.cpp ../src/sequence_region.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) +#add_executable(test_tree tree/test_tree.cpp) add_test(test_ucsc2bit_to_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_to_fasta") -set_target_properties(test_cache +set_target_properties(test_cache_twobit + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_cache_fourbit PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_view PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") -#set_target_properties(test_tree -# PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_flags + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_check + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs_as_ucsc2bit PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_twobit_byte PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_fourbit_byte + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_to_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_as_fasta PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_sequenceregion + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_utils PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +#set_target_properties(test_tree +# PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") + diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp new file mode 100644 index 00000000..d71e396e --- /dev/null +++ b/test/cache/test_cache_fourbit.cpp @@ -0,0 +1,226 @@ +#define BOOST_TEST_MODULE fastfs_cache_fourbit + +#include + +#include "config.hpp" + +#include "fasta_to_fourbit_fastafs.hpp" +//#include "fastafs.hpp" + + + +BOOST_AUTO_TEST_SUITE(Testing) + +/** + * @brief + * + * @test + */ +BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) +{ + fourbit_byte b = fourbit_byte(); + + char *seq1; + char *seq2; + + const char *seq;// don't dereference, pointer to static four_bit property + + // test 0000 0000 -> 00000000 -> 0 + b.set(4, 0);// A => 0 + b.set(0, 0); + BOOST_CHECK_EQUAL(b.data, 0); + + seq1 = b.get(1); + seq2 = b.get(2); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "A"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "AA"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "AA"), 0); + + delete[] seq1; + delete[] seq2; + + // test 11 10 11 11 -> 00001100 -> 239 + b.set(4, 14); // V: 14 + b.set(0, 15); // N: 15 + BOOST_CHECK_EQUAL(b.data, 239); + + seq1 = b.get(1); + seq2 = b.get(2); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "V"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "VN"), 0); + + delete[] seq1; + delete[] seq2; + + // GT: 0010 0011 + b.set(4, 2); // G + b.set(0, 3); // T + BOOST_CHECK_EQUAL(b.data, 35); + + seq1 = b.get(1); + seq2 = b.get(2); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "G"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "GT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "GT"), 0); + + delete[] seq1; + delete[] seq2; + + + // set to UR (0100 0101) + b.set(4, 4); + b.set(0, 5); + BOOST_CHECK_EQUAL(b.data, 69); + + seq1 = b.get(1); + seq2 = b.get(2); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "U"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "UR"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "UR"), 0); + + delete[] seq1; + delete[] seq2; + + + // set to AN (0000 1111) + b.set(4, 0); + b.set(0, 15); + BOOST_CHECK_EQUAL(b.data, 15); + + seq1 = b.get(1); + seq2 = b.get(2); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "A"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "AN"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "AN"), 0); + + delete[] seq1; + delete[] seq2; + + + // set to NA (1111 0000) + b.set(4, 15); + b.set(0, 0); + BOOST_CHECK_EQUAL(b.data, 240); + + seq1 = b.get(1); + seq2 = b.get(2); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "N"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "NA"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "NA"), 0); + + delete[] seq1; + delete[] seq2; +} + + + +/** + * @brief + * + * @test tests whether a fourbit object is indeed stored as a single byte + */ +BOOST_AUTO_TEST_CASE(Test_size) +{ + fourbit_byte b = fourbit_byte(); + BOOST_CHECK_EQUAL(sizeof(b.data), 1); +} + + + + +BOOST_AUTO_TEST_CASE(test_cache) +{ + size_t written = fasta_to_fourbit_fastafs("test/data/test_004.fa", "tmp/test_004.fastafs"); + + static std::string reference = + // GENERIC-HEADER + "\x0F\x0A\x46\x53"s// [0, 3] + "\x00\x00\x00\x00"s// [4, 7] version + "\x80\x00"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] + "\x00\x00\x00\x68"s // [10, 13] index position in file (104?) + + // DATA + "\x00\x00\x00\x4B"s// [14, 17] seq length (75) + "\xFB\x70\xD8\xC1\x4A\x29\x6E\x35\xD2\xAE"s// [18, 27] sequence (four bit format; n chars = 76/2 = 38) + "\x48\x3B\x9C\xFB\x26\x0C\xFD\x98\x43\x51"s// [28, 37] + "\x7A\xE9\xBD\xEC\xF5\x32\x61\x87\xA4\x00"s// [38, 47] + "\xE3\x9C\x7F\xB4\x2A\x8D\x65\x10"s// [48, 56] + "\x00\x00\x00\x02"s// [, ] n-blocks (2) + "\x00\x00\x00\x1B"s// [, ] n-block[0] starts (27) + "\x00\x00\x00\x4D"s// [, ] n-block[1] starts (77) + "\x00\x00\x00\x24"s// [, ] n-block[0] ends (36|37) + "\x00\x00\x00\x4F"s// [, ] n-block[1] ends (79) + "\x4A\x4D\x43\xFF\x09\x08\x29\xCD\x05\x9A\x08\x3C\x48\x3F\xEB\x3C"s// [76, ] checksum + "\x00\x00\x00\x01"s// [92, ] m-blocks (1) + "\x00\x00\x00\x35"s// [96, ] m-block starts (53) + "\x00\x00\x00\x44"s// [100, ] m-block starts (68) + + // INDEX + "\x00\x00\x00\x01"s // [104, ] 1 sequences + "\x50\x00" // [, ] complete, IUPEC [01010000] + "\x05"s "IUPAC"s // [, ] name + "\x00\x00\x00\x0E"s // [, ] data position in file (14) + + // METADATA + "\x00"s // [120] no metadata fields [padding will come soon?] + + // CRC32 + "\x3d\xbf\x6e\xbf"s + ; + + BOOST_CHECK_EQUAL(written, 125); + + //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); + std::ifstream file("tmp/test_004.fastafs", std::ios::in | std::ios::binary | std::ios::ate); + BOOST_REQUIRE(file.is_open()); + + std::streampos size; + char * buffer; + size = file.tellg(); + buffer = new char [size]; + + file.seekg(0, std::ios::beg); + file.read(buffer, size); + BOOST_CHECK_EQUAL(file.gcount(), size); + file.close(); + + //BOOST_CHECK_UNEQUAL(ret, -1); + + + for(unsigned int i = 0; i < size; i++) { + BOOST_CHECK_EQUAL(buffer[i], reference[i]); + + /* + if(reference[i] != buffer[i]) { + printf("comparing char %u\n", i); + printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); + } + */ + + } + + delete[] buffer; + + + + // check fastafs filesize + fastafs f = fastafs(""); + f.load("tmp/test_004.fastafs"); + BOOST_CHECK_EQUAL(f.fastafs_filesize(), 125); +} + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/cache/test_cache.cpp b/test/cache/test_cache_twobit.cpp similarity index 89% rename from test/cache/test_cache.cpp rename to test/cache/test_cache_twobit.cpp index 44279c1b..1a493b5e 100644 --- a/test/cache/test_cache.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -1,11 +1,11 @@ -#define BOOST_TEST_MODULE fastfs_cache +#define BOOST_TEST_MODULE fastfs_cache_twobit #include #include "config.hpp" -//#include "twobit_byte.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" +//#include "fastafs.hpp" @@ -224,21 +224,21 @@ BOOST_AUTO_TEST_CASE(Test_size) */ BOOST_AUTO_TEST_CASE(test_cache) { - size_t written = fasta_to_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); + size_t written = fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cache_test.fastafs"); static std::string reference = // GENERIC-HEADER "\x0F\x0A\x46\x53"s// [0, 3] "\x00\x00\x00\x00"s// [4, 7] version - "\x00\x01"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] + "\x80\x00"s// [8, 9] FASTAFS flag [ 10000000 | 00000000 ] "\x00\x00\x01\x37"s // [10, 13] index position in file (153) // DATA "\x00\x00\x00\x10"s// [14, 17] seq length (16) (of 2bit encoded bytes; n-blocks are excluded) "\x00\x55\xAA\xFF"s// [18, 21] sequence "\x00\x00\x00\x00"s// [22, 25] n-blocks (0) - "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 45] checksum - "\x00\x00\x00\x01"s// [46, 49] m-blocks (1) + "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 41] checksum + "\x00\x00\x00\x01"s// [42, ] m-blocks (1) "\x00\x00\x00\x00"s// [50, 53] m-block starts (0) "\x00\x00\x00\x0F"s// [54, 57] m-block starts (15) "\x00\x00\x00\x0C"s// [58, 61] seq length (12) (of 2bit encoded bytes; n-blocks are excluded) @@ -292,36 +292,39 @@ BOOST_AUTO_TEST_CASE(test_cache) // INDEX "\x00\x00\x00\x07"s // [339, 342] 7 sequences - "\x00\x08" // [343, 344] complete, DNA and not circular + "\x010\x00" // [343, 344] complete, DNA and not circular "\x04"s "chr1"s // [345, 349] name "\x00\x00\x00\x0E"s // [350, 353] data position in file (14) - "\x00\x08" // [354, 355] complete, DNA and not circular + "\x010\x00" // [354, 355] complete, DNA and not circular "\x04"s "chr2"s // [356, 360] name "\x00\x00\x00\x36"s // [361, 364] data position in file (54) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x06"s "chr3.1"s // [, ] name "\x00\x00\x00\x65"s // [, ] data position in file (101) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x06"s "chr3.2"s // [, ] name "\x00\x00\x00\x8D"s // [, ] data position in file (141) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x06"s "chr3.3"s // [, ] name "\x00\x00\x00\xB5"s // [, ] data position in file (181) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x04"s "chr4"s // [, ] name "\x00\x00\x00\xDD"s // [, ] data position in file (221) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x04"s "chr5"s // [, ] name "\x00\x00\x01\x0A"s // [, ] data position in file (290) // METADATA - "\x00" // [399] no metadata fields [padding will come soon?] + "\x00"s // [399] no metadata fields [padding will come soon?] + + // CRC32 checksums + "\x1e\x77\x77\x22"s ; - BOOST_CHECK_EQUAL(written, 399); + BOOST_CHECK_EQUAL(written, 403); //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); - std::ifstream file("tmp/test_cachce_test.fastafs", std::ios::in | std::ios::binary | std::ios::ate); + std::ifstream file("tmp/test_cache_test.fastafs", std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); std::streampos size; @@ -345,6 +348,12 @@ BOOST_AUTO_TEST_CASE(test_cache) } delete[] buffer; + + + // check computed file size + fastafs f = fastafs(""); + f.load("tmp/test_cache_test.fastafs"); + BOOST_CHECK_EQUAL(f.fastafs_filesize(), 403); } @@ -358,11 +367,11 @@ BOOST_AUTO_TEST_CASE(test_cache) BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) { // generate FASTAFS file from FASTA file - fasta_to_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); + fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cache_test.fastafs"); // load the FASTAFS file fastafs f2 = fastafs("test"); - f2.load("tmp/test_cachce_test.fastafs"); + f2.load("tmp/test_cache_test.fastafs"); const uint32_t padding = 60; ffs2f_init* cache_p60_uc = f2.init_ffs2f(padding, false); // upper case @@ -377,7 +386,7 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) std::string output = ""; while(written < f2.fasta_filesize(padding)) { - w = f2.view_fasta_chunk_cached(cache_p60_uc, buffer, write_size, written); + w = f2.view_fasta_chunk(cache_p60_uc, buffer, write_size, written); output.append(buffer, w); written += w; } @@ -392,7 +401,7 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) output = ""; while(written < f2.fasta_filesize(padding)) { - w = f2.view_fasta_chunk_cached(cache_p60_mc, buffer, write_size, written); + w = f2.view_fasta_chunk(cache_p60_mc, buffer, write_size, written); output.append(buffer, w); written += w; } @@ -418,11 +427,11 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) BOOST_AUTO_TEST_CASE(test_cache_with_newlines) { // generate FASTAFS file from FASTA file - fasta_to_fastafs("test/data/test_003.fa", "tmp/test_cachce_test_003.fastafs"); + fasta_to_twobit_fastafs("test/data/test_003.fa", "tmp/test_cache_test_003.fastafs"); // load the FASTAFS file fastafs f2 = fastafs("test"); - f2.load("tmp/test_cachce_test_003.fastafs"); + f2.load("tmp/test_cache_test_003.fastafs"); const uint32_t padding = 60; ffs2f_init* cache_p60 = f2.init_ffs2f(padding, false); @@ -435,7 +444,7 @@ BOOST_AUTO_TEST_CASE(test_cache_with_newlines) std::string output = ""; while(written < f2.fasta_filesize(padding)) { - w = f2.view_fasta_chunk_cached(cache_p60, buffer, write_size, written); + w = f2.view_fasta_chunk(cache_p60, buffer, write_size, written); output.append(buffer, w); written += w; } diff --git a/test/check/test_check.cpp b/test/check/test_check.cpp new file mode 100644 index 00000000..16eb33a3 --- /dev/null +++ b/test/check/test_check.cpp @@ -0,0 +1,138 @@ +#define BOOST_TEST_MODULE fastfs_check + +#include + +#include "config.hpp" + +#include "fasta_to_twobit_fastafs.hpp" +#include "fasta_to_fourbit_fastafs.hpp" + + + +BOOST_AUTO_TEST_SUITE(Testing) + +/** + * @brief + * + * @test + */ +BOOST_AUTO_TEST_CASE(test_file_integrity) +{ + fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cache_test.fastafs"); + + // check computed file size + fastafs f = fastafs(""); + f.load("tmp/test_cache_test.fastafs"); + BOOST_REQUIRE_EQUAL(f.fastafs_filesize(), 403); + + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), true); + BOOST_CHECK_EQUAL(f.check_file_integrity(false), true); + + static const int i_min = 5; + static const int i_max = 403 - 5 - 1 - 1; + + for(int i = i_min; i < i_max ; i ++) { + char buffer[400]; + + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + std::ifstream fh_fastafs_in("tmp/test_cache_test.fastafs", std::ios::out | std::ios::binary); + std::ofstream fh_fastafs_out(tmp_file, std::ios::out | std::ios::binary); + + fh_fastafs_in.read(buffer, i); + fh_fastafs_out.write(reinterpret_cast(&buffer), i); + + // modify the i-th base to something else + fh_fastafs_in.read(buffer, 1); + if(buffer[0] == '\x01') { + buffer[0] = '\x02'; + } else { + buffer[0] = '\x01'; + } + fh_fastafs_out.write(reinterpret_cast(&buffer), 1); + + fh_fastafs_in.read(buffer, 403 - i - 1); + fh_fastafs_out.write(reinterpret_cast(&buffer), 403 - i - 1); + + fh_fastafs_in.close(); + fh_fastafs_out.close(); + + + fastafs f = fastafs(""); + f.filename = tmp_file; // don't load + BOOST_CHECK_EQUAL(f.check_file_integrity(false), false); + } + + + for(int i = 18; i <= 21 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 58; i <= 60 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 113; i <= 116 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 157; i <= 160 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 201; i <= 204 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + { + int i = 245; + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + { + int i = 294; + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + + + for(int i = i_min; i < i_max ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + remove(tmp_file.c_str()); + } + +} + + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/data/test_004.fa b/test/data/test_004.fa new file mode 100644 index 00000000..689dda8a --- /dev/null +++ b/test/data/test_004.fa @@ -0,0 +1,4 @@ +>IUPAC +NBKAHMDCUWGSYVTRHGWVUMTBSDN----- +-----BGYADNHSMUTRCKWVsbhvdnrtgyc +mkwuaAVTSDKNB---UGWMHYRC diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index c329ee6b..ca5e12fc 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -8,7 +8,7 @@ #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -36,7 +36,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); @@ -61,11 +61,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) uint32_t ret; char chunk[4]; for(uint32_t i = 0; i < 23; i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p100->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p100->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(ret, 1); } for(uint32_t i = 23; i < 23 + 5; i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p100->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p100->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(ret, 0); } @@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) std::string ref = ">chr1\nttttccccaaaagggg\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p23->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p23->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } @@ -95,7 +95,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); @@ -115,13 +115,13 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0) std::string ref = ">chr1\nttttccccaaaagggg\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } // check if out of bound query returns 0 - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, ref.size(), &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), &file); BOOST_CHECK_EQUAL(ret, 0); file.close(); @@ -134,7 +134,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0__no_masking) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -156,13 +156,13 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0__no_masking) std::string ref = ">chr1\nTTTTCCCCAAAAGGGG\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } // check if out of bound query returns 0 - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, ref.size(), &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), &file); BOOST_CHECK_EQUAL(ret, 0); file.close(); @@ -175,7 +175,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -196,7 +196,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_md5) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -226,7 +226,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test_002.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test_002.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); - BOOST_CHECK_EQUAL(fs.check_integrity(), 0); + BOOST_CHECK_EQUAL(fs.check_sequence_integrity(false), true); } @@ -353,7 +353,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs__dict_virtualization) */ std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -380,6 +380,510 @@ BOOST_AUTO_TEST_CASE(test_fastafs__dict_virtualization) // } } +/** + * @description tests reading a request like "chr1:123-456" etc. + */ +BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) +{ + std::string fastafs_file = "tmp/test.fastafs"; + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); + + fastafs fs = fastafs("test"); + fs.load(fastafs_file); + + BOOST_REQUIRE(fs.data.size() > 0); + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:0"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 't'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:3"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 't'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:4"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'c'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:15"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'g'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:16"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 0); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 0); + //BOOST_CHECK_EQUAL(buffer[0], '\n'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:0"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:1"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'C'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:7"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:8"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:9"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:10"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:11"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:12"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:15"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:14-15"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'T'); + BOOST_CHECK_EQUAL(buffer[1], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:14-99999"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'T'); + BOOST_CHECK_EQUAL(buffer[1], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:16"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 0); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 0); + //BOOST_CHECK_EQUAL(buffer[0], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr4"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 8); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 8); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + BOOST_CHECK_EQUAL(buffer[2], 'T'); + BOOST_CHECK_EQUAL(buffer[3], 'G'); + BOOST_CHECK_EQUAL(buffer[4], 'n'); + BOOST_CHECK_EQUAL(buffer[5], 'n'); + BOOST_CHECK_EQUAL(buffer[6], 'n'); + BOOST_CHECK_EQUAL(buffer[7], 'n'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr4:4-"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 4); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + BOOST_CHECK_EQUAL(buffer[1], 'n'); + BOOST_CHECK_EQUAL(buffer[2], 'n'); + BOOST_CHECK_EQUAL(buffer[3], 'n'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr4:-1";// from left to 1: <0,1] + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + //BOOST_CHECK_EQUAL(buffer[2], 'T'); + //BOOST_CHECK_EQUAL(buffer[3], 'G'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr3.1:1-2"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'C'); + BOOST_CHECK_EQUAL(buffer[1], 'T'); + } + + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr3.3"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 0); // small buffer size + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + BOOST_CHECK_EQUAL(buffer[2], 'T'); + BOOST_CHECK_EQUAL(buffer[3], 'G'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 4); // small buffer size + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + BOOST_CHECK_EQUAL(buffer[2], 'T'); + BOOST_CHECK_EQUAL(buffer[3], 'G'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 8); // small buffer size + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'a'); + BOOST_CHECK_EQUAL(buffer[1], 'a'); + BOOST_CHECK_EQUAL(buffer[2], 'a'); + BOOST_CHECK_EQUAL(buffer[3], 'a'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 12); // small buffer size + BOOST_CHECK_EQUAL(written, 3); + BOOST_CHECK_EQUAL(buffer[0], 'c'); + BOOST_CHECK_EQUAL(buffer[1], 'c'); + BOOST_CHECK_EQUAL(buffer[2], 'c'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr5:2-5"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 4); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 2); // small buffer size + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'T'); + BOOST_CHECK_EQUAL(buffer[1], 'G'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chrDOESNOTEXIST"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 0); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); // small buffer size + BOOST_CHECK_EQUAL(written, 0); + } + +} + diff --git a/test/fastafs/test_ucsc2bit.cpp b/test/fastafs/test_ucsc2bit.cpp index 09049c03..e11836d7 100644 --- a/test/fastafs/test_ucsc2bit.cpp +++ b/test/fastafs/test_ucsc2bit.cpp @@ -128,7 +128,7 @@ NNACTG #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -153,7 +153,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_view_chunked_2bit) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); // 2. load fastafs fastafs fs = fastafs("test"); @@ -400,7 +400,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_view_chunked_2bit_with_offset) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); diff --git a/test/flags/test_flags.cpp b/test/flags/test_flags.cpp new file mode 100644 index 00000000..51e9a2a8 --- /dev/null +++ b/test/flags/test_flags.cpp @@ -0,0 +1,200 @@ +#define BOOST_TEST_MODULE flags + +#include + +#include "config.hpp" + +#include "flags.hpp" + + +//#include +//#include + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_fastafs_flags) +{ + fastafs_flags f; + + char buffer[2 + 1]; + buffer[2] = '\0'; + + // test: 00000000 00000000 + buffer[0] = '\x0'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + + + // test: 10000000 00000000 + buffer[0] = '\x80'; // worked with writing to file and checking with `xxd -b file` ~ this is binary equivalent to 10000000 + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + + + // test: 11111111 00000000 + buffer[0] = '\xFF'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + + // test: 00000001 00000000 + buffer[0] = '\x01'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + + + + // re-test: 00000000 00000000 + buffer[0] = '\x0'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + + f.set_complete(); + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + f.set_complete(); + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + f.set_complete(); + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + + f.set_incomplete(); + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + f.set_incomplete(); + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + f.set_incomplete(); + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); +} + + +BOOST_AUTO_TEST_CASE(test_fastafs_sequence_flags) +{ + fastafs_sequence_flags fs; + + fs.set_dna(); + fs.set_rna(); + fs.set_iupec_nucleotide(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), false); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_rna(); + fs.set_iupec_nucleotide(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), false); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_dna(); + fs.set_iupec_nucleotide(); + fs.set_rna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), true); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_dna(); + fs.set_iupec_nucleotide(); + fs.set_rna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), true); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), true); + BOOST_CHECK_EQUAL(fs.is_rna(), false); + + + fs.set_linear(); + BOOST_CHECK_EQUAL(fs.is_linear(), true); + BOOST_CHECK_EQUAL(fs.is_circular(), false); + + fs.set_circular(); + fs.set_circular(); + fs.set_linear(); + BOOST_CHECK_EQUAL(fs.is_linear(), true); + BOOST_CHECK_EQUAL(fs.is_circular(), false); + + fs.set_linear(); + fs.set_linear(); + fs.set_circular(); + BOOST_CHECK_EQUAL(fs.is_linear(), false); + BOOST_CHECK_EQUAL(fs.is_circular(), true); + + + fs.set_complete(); + BOOST_CHECK_EQUAL(fs.is_complete(), true); + BOOST_CHECK_EQUAL(fs.is_incomplete(), false); + + fs.set_incomplete(); + fs.set_incomplete(); + fs.set_complete(); + BOOST_CHECK_EQUAL(fs.is_complete(), true); + BOOST_CHECK_EQUAL(fs.is_incomplete(), false); + + fs.set_complete(); + fs.set_complete(); + fs.set_incomplete(); + BOOST_CHECK_EQUAL(fs.is_complete(), false); + BOOST_CHECK_EQUAL(fs.is_incomplete(), true); + + + + // get characters + fs.set_incomplete(); + fs.set_linear(); + fs.set_dna(); + + std::array bits = fs.get_bits(); + BOOST_CHECK_EQUAL(bits[0], '\0'); + BOOST_CHECK_EQUAL(bits[1], '\0'); + + + fs.set_complete(); + fs.set_circular(); + fs.set_iupec_nucleotide(); + + bits = fs.get_bits(); + BOOST_CHECK_EQUAL(bits[0], '\x58');// 1011000 + BOOST_CHECK_EQUAL(bits[1], '\0'); + +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/fourbit_byte/test_fourbit_byte.cpp b/test/fourbit_byte/test_fourbit_byte.cpp new file mode 100644 index 00000000..831bf828 --- /dev/null +++ b/test/fourbit_byte/test_fourbit_byte.cpp @@ -0,0 +1,118 @@ +#define BOOST_TEST_MODULE fourbit_byte + +#include + +#include "config.hpp" + +#include "fourbit_byte.hpp" + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_fourbit_conversions) +{ + char seq[3]; + const char* seq_get; + seq[2] = '\0'; + fourbit_byte t; + + seq[0] = 'A'; + seq[1] = 'C'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 1); + + seq[0] = 'G'; + seq[1] = 'T'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 35); + + seq[0] = 'U'; + seq[1] = 'R'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 69); + + seq[0] = 'Y'; + seq[1] = 'K'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 103); + + seq[0] = 'M'; + seq[1] = 'S'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 137); + + seq[0] = 'W'; + seq[1] = 'B'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 171); + + seq[0] = 'D'; + seq[1] = 'H'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 205); + + seq[0] = 'V'; + seq[1] = 'N'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 239); + +} + +BOOST_AUTO_TEST_CASE(test_fourbit_static_offset_conversion_test) +{ + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(0), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(1), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(2), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(3), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(4), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(5), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(6), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(7), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(8), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(9), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(10), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(11), 0); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp new file mode 100644 index 00000000..5513761b --- /dev/null +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -0,0 +1,259 @@ +#define BOOST_TEST_MODULE sequence_region + +#include + +#include "config.hpp" + +#include "sequence_region.hpp" + + +//#include +//#include + + +BOOST_AUTO_TEST_SUITE(Testing) + +/* + * Goal is to parse the following strings: + * "chr1" + * "chr1:" + * "chr1:123" # single base + * "chr1:123-" + * "chr1:123-456" + * "chr1:-456" == "chr1:0-456" + + * "chr1:123-456:asdasd" error + */ + + +BOOST_AUTO_TEST_CASE(test_sequence_region) +{ + { + char arg[] = "/seq/chr1"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chr1:"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chr1:123"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); + } + + { + char arg[] = "/seq/chr1:-123"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 0); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); + } + + { + char arg[] = "/seq/chr1:123-456"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 456); + } + + + { + char arg[] = "/seq/chr1:123-"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end, false); + //BOOST_CHECK_EQUAL(sr.end , 456); - underfined + } + + { + char arg[] = "/seq/chr1:456-123"; + + sequence_region *sr = nullptr; + if(sr == nullptr) {// compiler doesn't understand this otherwise + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); + } + } + +} + + + +BOOST_AUTO_TEST_CASE(test_sequence_region3) +{ + { + char arg[] = "/seq/chrRr1"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:1234"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 1234); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 1234); + } + + { + char arg[] = "/seq/chrRr1:-1234"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 0); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 1234); + } + + { + char arg[] = "/seq/chrRr1:1234-1235"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 1234); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 1235); + } + + + { + char arg[] = "/seq/chrRr1:1234-"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 1234); + + BOOST_CHECK_EQUAL(sr.has_defined_end, false); + //BOOST_CHECK_EQUAL(sr.end , 1235); - underfined + } + + { + + sequence_region *sr = nullptr; + //if(sr == nullptr) {// compiler doesn't understand this otherwise + char arg[] = "/seq/chrRr1:1235-1234"; + + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); + //} + } + +} + + + + +BOOST_AUTO_TEST_CASE(test_sequence_region2) +{ + { + char arg[] = "/seq/chrRr1"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:123"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); + } + + { + char arg[] = "/seq/chrRr1:-123"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 0); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); + } + + { + char arg[] = "/seq/chrRr1:123-456"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 456); + } + + + { + char arg[] = "/seq/chrRr1:123-"; + sequence_region sr = sequence_region(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end, false); + //BOOST_CHECK_EQUAL(sr.end , 456); - underfined + } + + { + char arg[] = "/seq/chrRr1:456-123"; + + sequence_region *sr = nullptr; + //if(sr == nullptr) {// compiler doesn't understand this otherwise + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); + //} + } + +} + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp b/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp index 98103767..4b7a6e93 100644 --- a/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp +++ b/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp @@ -9,7 +9,7 @@ #include "config.hpp" #include "ucsc2bit.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE(test_ucsc2bit_to_fasta_file) // 1: FASTA to FASTAFS file: std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); diff --git a/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp b/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp index 36361cab..e29175b9 100644 --- a/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp +++ b/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp @@ -5,7 +5,7 @@ #include "config.hpp" #include "utils.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" #include "ucsc2bit_to_fastafs.hpp" @@ -21,8 +21,8 @@ BOOST_AUTO_TEST_CASE(test_ucsc2bit_to_fasta) std::string fastafs_file2 = "tmp/test.regenerated.fastafs"; std::string ucsc2bit_file = "tmp/test.2bit"; - // 01 fasta_to_fastafs() - fasta_to_fastafs("test/data/test.fa", fastafs_file); + // 01 fasta_to_twobit_fastafs() + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); // 02 load fastafs fastafs fs = fastafs("test"); @@ -52,7 +52,7 @@ BOOST_AUTO_TEST_CASE(test_ucsc2bit_to_fasta) std::istream_iterator b2(ifs2), e2; BOOST_CHECK_EQUAL_COLLECTIONS(b1, e1, b2, e2); - BOOST_CHECK_EQUAL(written, (size_t) 399); + BOOST_CHECK_EQUAL(written, (size_t) 403); } diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 4bce700a..64237da2 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -8,7 +8,8 @@ #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" +#include "fasta_to_fourbit_fastafs.hpp" #include "fastafs.hpp" @@ -139,7 +140,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_twobit_offset_calc) std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -186,7 +187,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) std::string fasta_file = "test/data/" + test_name + ".fa"; std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_fastafs(fasta_file, fastafs_file); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); fastafs fs = fastafs(test_name); fs.load(fastafs_file); @@ -201,7 +202,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) // padding: 4 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG @@ -210,7 +211,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 999 - longer than longest seq - written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p999, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTTCCCCAAAAGGGG >chr2 ACTGACTGNNNNACTG >chr3.1 ACTGACTGAAAAC >chr3.2 ACTGACTGAAAACC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG @@ -219,7 +220,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 5 - see if 2bit works - written = fs.view_fasta_chunk_cached(cache_p5, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p5, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTTC CCCAA AAGGG G >chr2 ACTGA CTGNN NNACT G >chr3.1 ACTGA CTGAA AAC >chr3.2 ACTGA CTGAA AACC >chr3.3 ACTGA CTGAA AACCC >chr4 ACTGN NNN >chr5 NNACT G @@ -228,7 +229,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -237,7 +238,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 1 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 1); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 1); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -246,7 +247,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 2 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 2); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 2); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -255,7 +256,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 3 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 3); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 3); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -264,7 +265,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 4 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 4); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 4); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -273,7 +274,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 5 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 5); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 5); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -282,7 +283,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 6 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 6); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 6); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -291,7 +292,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 7 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 7); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 7); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -300,7 +301,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 8 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 8); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 8); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -309,7 +310,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 9 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 9); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 9); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -318,7 +319,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 10 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 10); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 10); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG //XXXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| @@ -331,7 +332,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) for(uint32_t offset = 0; offset < 62; ++offset) { std::string substr_file = full_file.substr(offset, 100); - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, offset); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, offset); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); @@ -348,6 +349,8 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) delete cache_p999; } + + BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) { uint32_t written; @@ -355,7 +358,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) std::string fasta_file = "test/data/" + test_name + ".fa"; std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_fastafs(fasta_file, fastafs_file); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); fastafs fs = fastafs(test_name); fs.load(fastafs_file); @@ -373,7 +376,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) //[>] [c] [h] [r] [3] [.] [1] [\n] [A] [C] [T] [G] [A] [C] [T] [G] [A] [A] [A] [A] [C] [\n] BOOST_CHECK_EQUAL(fs.data[2]->fasta_filesize(100), 22); - written = fs.data[2]->view_fasta_chunk_cached(cache_p100->sequences[2], buffer, 100, 0, &fh); + written = fs.data[2]->view_fasta_chunk(cache_p100->sequences[2], buffer, 100, 0, &fh); BOOST_CHECK_EQUAL(written, 22); std::string std_buffer = std::string(buffer, written); @@ -395,10 +398,12 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) std::string fasta_file = "test/data/" + test_name + ".fa"; std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_fastafs(fasta_file, fastafs_file); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); fastafs fs = fastafs(test_name); fs.load(fastafs_file); + BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); + uint32_t written; char *buffer = new char[2110];// file size on disk is 2108 bytes flush_buffer(buffer, 2110, '\0'); @@ -436,13 +441,12 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) [..] [.] [.] - */ for(uint32_t start_pos = 0; start_pos < full_file.size(); start_pos++) { for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { std::string substr_file = std::string(full_file, start_pos, buffer_len); - written = fs.view_fasta_chunk_cached(cache, buffer, buffer_len, start_pos); + written = fs.view_fasta_chunk(cache, buffer, buffer_len, start_pos); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); @@ -468,5 +472,142 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) +BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) +{ + std::string test_name = "test_004"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; + + fasta_to_fourbit_fastafs(fasta_file, fastafs_file); + fastafs fs = fastafs(test_name); + fs.load(fastafs_file); + + BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); + BOOST_REQUIRE_EQUAL(fs.fasta_filesize(32), 98); + + + char *buffer = new char[200];// buffer needs to be c buffer because of the fuse layer + flush_buffer(buffer, 200, '?'); + + ffs2f_init* cache_p1 = fs.init_ffs2f(1, true); + ffs2f_init* cache_p4 = fs.init_ffs2f(4, true); + ffs2f_init* cache_p5 = fs.init_ffs2f(5, true); + ffs2f_init* cache_p32 = fs.init_ffs2f(32, true);// allow masking = T + ffs2f_init* cache_p999 = fs.init_ffs2f(999, true); + + std::string std_buffer; + uint32_t written; + + + // padding = 32, offset = 0 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 98); + std_buffer = std::string(buffer, 98); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 1 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 1); + BOOST_CHECK_EQUAL(written, 97); + std_buffer = std::string(buffer, 97); + BOOST_CHECK_EQUAL(std_buffer.compare("IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 2 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 2); + BOOST_CHECK_EQUAL(written, 96); + std_buffer = std::string(buffer, 96); + BOOST_CHECK_EQUAL(std_buffer.compare("UPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 5 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 5); + BOOST_CHECK_EQUAL(written, 93); + std_buffer = std::string(buffer, 93); + BOOST_CHECK_EQUAL(std_buffer.compare("C\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 6 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 6); + BOOST_CHECK_EQUAL(written, 92); + std_buffer = std::string(buffer, 92); + BOOST_CHECK_EQUAL(std_buffer.compare("\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 7 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 7); + BOOST_CHECK_EQUAL(written, 91); + std_buffer = std::string(buffer, 91); + BOOST_CHECK_EQUAL(std_buffer.compare("NBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 8 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 8); + BOOST_CHECK_EQUAL(written, 90); + std_buffer = std::string(buffer, 90); + BOOST_CHECK_EQUAL(std_buffer.compare("BKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 9 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 9); + BOOST_CHECK_EQUAL(written, 89); + std_buffer = std::string(buffer, 89); + BOOST_CHECK_EQUAL(std_buffer.compare("KAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 10 + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 10); + BOOST_CHECK_EQUAL(written, 88); + std_buffer = std::string(buffer, 88); + BOOST_CHECK_EQUAL(std_buffer.compare("AHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + + // padding = 1, offset = 0 + written = fs.view_fasta_chunk(cache_p1, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 183); + std_buffer = std::string(buffer, 183); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nN\nB\nK\nA\nH\nM\nD\nC\nU\nW\nG\nS\nY\nV\nT\nR\nH\nG\nW\nV\nU\nM\nT\nB\nS\nD\nN\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\nB\nG\nY\nA\nD\nN\nH\nS\nM\nU\nT\nR\nC\nK\nW\nV\ns\nb\nh\nv\nd\nn\nr\nt\ng\ny\nc\nm\nk\nw\nu\na\nA\nV\nT\nS\nD\nK\nN\nB\n-\n-\n-\nU\nG\nW\nM\nH\nY\nR\nC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 5, offset = 0 + written = fs.view_fasta_chunk(cache_p5, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 113); + std_buffer = std::string(buffer, 113); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAH\nMDCUW\nGSYVT\nRHGWV\nUMTBS\nDN---\n-----\n--BGY\nADNHS\nMUTRC\nKWVsb\nhvdnr\ntgycm\nkwuaA\nVTSDK\nNB---\nUGWMH\nYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 999, offset = 0 + written = fs.view_fasta_chunk(cache_p999, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 96); + std_buffer = std::string(buffer, 96); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN----------BGYADNHSMUTRCKWVsbhvdnrtgycmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + + + std::string full_file = ">IUPAC\nNBKA\nHMDC\nUWGS\nYVTR\nHGWV\nUMTB\nSDN-\n----\n----\n-BGY\nADNH\nSMUT\nRCKW\nVsbh\nvdnr\ntgyc\nmkwu\naAVT\nSDKN\nB---\nUGWM\nHYRC\n";// length = 117 + for(uint32_t offset = 0; offset < 62; ++offset) { + std::string substr_file = full_file.substr(offset, 200); + + written = fs.view_fasta_chunk(cache_p4, buffer, 200, offset); + std_buffer = std::string(buffer, substr_file.size()); + + BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << offset); + + flush_buffer(buffer, 200, '?'); + } + + delete[] buffer; + + delete cache_p1; + delete cache_p4; + delete cache_p5; + delete cache_p999; +} + + BOOST_AUTO_TEST_SUITE_END() +