From 195f1994e90ce2c666c63051bc34acc626b5382f Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 19 Nov 2019 13:43:25 +0100 Subject: [PATCH 001/119] sav --- test/data/test_004.fa | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 test/data/test_004.fa diff --git a/test/data/test_004.fa b/test/data/test_004.fa new file mode 100644 index 00000000..3b713227 --- /dev/null +++ b/test/data/test_004.fa @@ -0,0 +1,10 @@ +>test IUB/IUPAC amino acid and nucleic acid codes +VYW-DVWYGBSWRRVDH-WHBYVYAUDAMNRCHYHDRGRBSNKAHSHCGYVBRS--GDSB +RRHMGWSGCDURK-URTVMSDCYYDDVYYNRKSWRHWRCNUSTTCCRKKTKATVNMMWMG +MWABHUUMBGTCHRGNYKBBBKHVHRVCNUSC-KMCSYDHDKK-AVMAD-KHTWAKMTAA +UWCDNGBRHHYCGSVHYSSTDWSGBRANUMMADDYMHYTURHSWSNBNRNWMAAGRCGRB +AKVYDCKKCUAWDNAUKMRVTDGGYGMKHYW-BADCKTNGBGDHGHMVKU-KSTWUGKKW +SMCYHUYAUGMRHMTUMWRARR--NR-GVBTVYVCWSNCGWRRCWWAGKAMAYCVK---C +GRGUVDCRHUSABANWSKAUCSYW-K-VCDM-DRGBDTUAHSWGUBHCAVCMUVUVHR-G +TAUCUUKWGNRCDYNNDCGGRCAAVAHDSDWMHNBWYRCBDWGVCDCGTMHNSHRVYMMK +YD-SBYSCHAGCACGNRYTSYUYKYA-TCDNAN-BRCVCN From d748e10ef3c13854d2a3499fa8ffb2234361e179 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 09:01:09 +0100 Subject: [PATCH 002/119] gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5160f778..8dc6ef5f 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,4 @@ repeats.txt build/ xcheck.sh *.fa.fai +*.o From 47f5e44d1896524fca8312c6e638c91ae677daee Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 09:23:29 +0100 Subject: [PATCH 003/119] new branch --- CMakeLists.txt | 8 +- include/fourbit_byte.hpp | 23 ++++ src/fourbit_byte.cpp | 142 ++++++++++++++++++++++++ test/CMakeLists.txt | 3 + test/fourbit_byte/test_fourbit_byte.cpp | 68 ++++++++++++ 5 files changed, 240 insertions(+), 4 deletions(-) create mode 100644 include/fourbit_byte.hpp create mode 100644 src/fourbit_byte.cpp create mode 100644 test/fourbit_byte/test_fourbit_byte.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d561ee13..0c5855ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(fastafs) # Do this once in a while - find different bugs #set(CMAKE_CXX_COMPILER "clang++") -set(PROJECT_VERSION "1.6.2") +set(PROJECT_VERSION "1.7.0") set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs") set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues") @@ -161,16 +161,16 @@ enable_testing() add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as alias for 'make test' -add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") +add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N) +add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) add_test(test_cache "${BUILD_TEST_DIR}/test_cache") add_test(test_view "${BUILD_TEST_DIR}/test_view") -#add_test(test_tree "${BUILD_TEST_DIR}/test_tree") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs") add_test(test_ucsc2bit_as_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_as_fasta") add_test(test_utils "${BUILD_TEST_DIR}/test_utils") - +#add_test(test_tree "${BUILD_TEST_DIR}/test_tree") #find_program(CTEST_MEMORYCHECK_COMMAND NAMES valgrind) # 'ctest -T memcheck' #INCLUDE(Dart) diff --git a/include/fourbit_byte.hpp b/include/fourbit_byte.hpp new file mode 100644 index 00000000..a1a41145 --- /dev/null +++ b/include/fourbit_byte.hpp @@ -0,0 +1,23 @@ + +#ifndef fourbit_BYTE_HPP +#define fourbit_BYTE_HPP + +#include +#include "config.hpp" + +class fourbit_byte +{ +public: + static const char fourbit_alhpabet[17]; + static const char fourbit_hash[256][3]; + + unsigned char data; + void set(unsigned char, unsigned char); + void set(char*);// string met 4 bytes set + const char *get(void); + char *get(unsigned char); + + static unsigned char iterator_to_offset(unsigned int); +}; + +#endif diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp new file mode 100644 index 00000000..b0472562 --- /dev/null +++ b/src/fourbit_byte.cpp @@ -0,0 +1,142 @@ +#include +#include + +#include "config.hpp" + +#include "fourbit_byte.hpp" + + +/* + alphabet = ACGTURYKMSWBDHVN + +binary: IUPEC + +00000000 AA +00000001 AC +00000010 AG +... +00000010 NH +11111110 NV +11111111 NN + */ + +const char fourbit_byte::fourbit_alhpabet[17] = "ACGTURYKMSWBDHVN"; +const char fourbit_byte::fourbit_hash[256][3] = {"AA", "AC", "AG", "AT", "AU", "AR", "AY", "AK", "AM", "AS", "AW", "AB", "AD", "AH", "AV", "AN", "CA", "CC", "CG", "CT", "CU", "CR", "CY", "CK", "CM", "CS", "CW", "CB", "CD", "CH", "CV", "CN", "GA", "GC", "GG", "GT", "GU", "GR", "GY", "GK", "GM", "GS", "GW", "GB", "GD", "GH", "GV", "GN", "TA", "TC", "TG", "TT", "TU", "TR", "TY", "TK", "TM", "TS", "TW", "TB", "TD", "TH", "TV", "TN", "UA", "UC", "UG", "UT", "UU", "UR", "UY", "UK", "UM", "US", "UW", "UB", "UD", "UH", "UV", "UN", "RA", "RC", "RG", "RT", "RU", "RR", "RY", "RK", "RM", "RS", "RW", "RB", "RD", "RH", "RV", "RN", "YA", "YC", "YG", "YT", "YU", "YR", "YY", "YK", "YM", "YS", "YW", "YB", "YD", "YH", "YV", "YN", "KA", "KC", "KG", "KT", "KU", "KR", "KY", "KK", "KM", "KS", "KW", "KB", "KD", "KH", "KV", "KN", "MA", "MC", "MG", "MT", "MU", "MR", "MY", "MK", "MM", "MS", "MW", "MB", "MD", "MH", "MV", "MN", "SA", "SC", "SG", "ST", "SU", "SR", "SY", "SK", "SM", "SS", "SW", "SB", "SD", "SH", "SV", "SN", "WA", "WC", "WG", "WT", "WU", "WR", "WY", "WK", "WM", "WS", "WW", "WB", "WD", "WH", "WV", "WN", "BA", "BC", "BG", "BT", "BU", "BR", "BY", "BK", "BM", "BS", "BW", "BB", "BD", "BH", "BV", "BN", "DA", "DC", "DG", "DT", "DU", "DR", "DY", "DK", "DM", "DS", "DW", "DB", "DD", "DH", "DV", "DN", "HA", "HC", "HG", "HT", "HU", "HR", "HY", "HK", "HM", "HS", "HW", "HB", "HD", "HH", "HV", "HN", "VA", "VC", "VG", "VT", "VU", "VR", "VY", "VK", "VM", "VS", "VW", "VB", "VD", "VH", "VV", "VN", "NA", "NC", "NG", "NT", "NU", "NR", "NY", "NK", "NM", "NS", "NW", "NB", "ND", "NH", "NV", "NN"}; + + +/* + input \t output + 0 6 + 1 4 + 2 2 + 3 0 + + 4 6 + 5 4 + 6 2 + 7 0 + +not sure what the quickest way is - this way all calculations are done as ints, not as chars + + + */ +unsigned char fourbit_byte::iterator_to_offset(uint32_t iterator) +{ + return (unsigned char)((3 - (iterator % 4)) * 2); +} + +// @todo, offset needs to be second parameter +void fourbit_byte::set(unsigned char bit_offset, unsigned char nucleotide) +{ + // bit_offset must be: {0, 2, 4 or 6}; + // nucleotides must be: + // => T - 00, C - 01, A - 10, G - 11 + // => T - 00, C - 1, A - 2, G - 3 +#if DEBUG + if(bit_offset != 0 and bit_offset != 2 and bit_offset != 4 and bit_offset != 6) { + throw std::invalid_argument("fourbit_byte(bit_offset, ..) must be 0, 2, 4 or 6\n"); + } +#endif //DEBUG + //set bit(s): INPUT |= 1 << N; + //unset bit(s): INPUT &= ~(1 << N); + switch(nucleotide) { + case 0://NUCLEOTIDE_T (00) + // ??????00 + // 11?? ~(3 << bit_offset) + // data ???????? + this->data = (unsigned char)(this->data & ~(3 << bit_offset)); + break; + case 1://NUCLEOTIDE_C (01) + this->data = (unsigned char)(this->data & ~(2 << bit_offset)); + this->data = (unsigned char)(this->data | (1 << bit_offset)); + break; + case 2://NUCLEOTIDE_A (10) + this->data = (unsigned char)(this->data & ~(1 << bit_offset)); + this->data = (unsigned char)(this->data | (2 << bit_offset)); + break; + case 3://NUCLEOTIDE_G (11) + this->data = (unsigned char)(this->data | (nucleotide << bit_offset)); + break; +#if DEBUG + default: + throw std::invalid_argument("fourbit_byte::set(pos, nucleotide) invalid value\n"); + break; +#endif //DEBUG + } +}; + + +// input char "AACCCTTGG" +// N's are treated as 0, for some weird reason +void fourbit_byte::set(char* buffer) +{ + const std::array< unsigned char, 4> bit_offsets = {6, 4, 2, 0}; + for(unsigned char i = 0; i < 4; i++) { + switch(buffer[i]) { + case 't': + case 'T': + case 'n': + case 'N': + this->set(bit_offsets[i], 0); + break; + case 'c': + case 'C': + this->set(bit_offsets[i], 1); + break; + case 'a': + case 'A': + this->set(bit_offsets[i], 2); + break; + case 'g': + case 'G': + this->set(bit_offsets[i], 3); + break; +#if DEBUG + default: + throw std::invalid_argument("fourbit_byte::set(char *) invalid value\n"); + break; +#endif //DEBUG + } + } +} + +/** + * @brief fully decodes a fourbit byte, not referencing to a hash but allocating a new char*, + * slower than fourbit_byte::get(void) but capable of determining very ends +**/ +char *fourbit_byte::get(unsigned char length) +{ + char *seq = new char[length + 1]; + for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 + seq[i] = fourbit_byte::fourbit_hash[this->data][i]; + } + seq[length] = '\0'; + return seq; +} + + + +const char *fourbit_byte::get() +{ + return fourbit_byte::fourbit_hash[this->data]; +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a647f085..6f336dfd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,6 +24,7 @@ add_executable(test_view view/test_view.cpp ../src/fasta_to_fasta add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) @@ -43,6 +44,8 @@ set_target_properties(test_fastafs_as_ucsc2bit PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_twobit_byte PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_fourbit_byte + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_to_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_as_fasta diff --git a/test/fourbit_byte/test_fourbit_byte.cpp b/test/fourbit_byte/test_fourbit_byte.cpp new file mode 100644 index 00000000..e74f3f57 --- /dev/null +++ b/test/fourbit_byte/test_fourbit_byte.cpp @@ -0,0 +1,68 @@ +#define BOOST_TEST_MODULE fourbit_byte + +#include + +#include "config.hpp" + +#include "fourbit_byte.hpp" + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_fourbit_conversions) +{ + char seq[5]; + seq[4] = '\0'; + fourbit_byte t; + seq[0] = 'A'; + seq[1] = 'A'; + seq[2] = 'A'; + seq[3] = 'A'; + t.set(seq);//10101010 = 170 + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 170); + + seq[0] = 'T'; + seq[1] = 'A'; + seq[2] = 'A'; + seq[3] = 'A'; + t.set(seq); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 42); + seq[0] = 'A'; + seq[1] = 'C'; + seq[2] = 'T'; + seq[3] = 'G'; + t.set(seq); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 147); + seq[0] = 'N'; + seq[1] = 'C'; + seq[2] = 'T'; + seq[3] = 'N'; + t.set(seq);//00 01 00 00 + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 16); +} + +BOOST_AUTO_TEST_CASE(test_fourbit_static_offset_conversion_test) +{ + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(0), 6); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(1), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(2), 2); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(3), 0); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(4), 6); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(5), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(6), 2); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(7), 0); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(8), 6); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(9), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(10), 2); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(11), 0); +} + + + + +BOOST_AUTO_TEST_SUITE_END() From 3eb3a8dea4ab264aebfbf207815a3279db377155 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 09:49:08 +0100 Subject: [PATCH 004/119] minor improvement --- src/fourbit_byte.cpp | 79 +++++++++++++++++++++++++++++++++++--------- src/twobit_byte.cpp | 12 +++---- 2 files changed, 69 insertions(+), 22 deletions(-) diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index b0472562..c31cc63a 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -48,35 +48,82 @@ unsigned char fourbit_byte::iterator_to_offset(uint32_t iterator) // @todo, offset needs to be second parameter void fourbit_byte::set(unsigned char bit_offset, unsigned char nucleotide) { - // bit_offset must be: {0, 2, 4 or 6}; + // bit_offset must be: {0, or 4}; -> location in bits // nucleotides must be: // => T - 00, C - 01, A - 10, G - 11 // => T - 00, C - 1, A - 2, G - 3 #if DEBUG - if(bit_offset != 0 and bit_offset != 2 and bit_offset != 4 and bit_offset != 6) { - throw std::invalid_argument("fourbit_byte(bit_offset, ..) must be 0, 2, 4 or 6\n"); + if(bit_offset != 0 and bit_offset != 4) { + throw std::invalid_argument("fourbit_byte(bit_offset, ..) must be 0 or 4\n"); } #endif //DEBUG //set bit(s): INPUT |= 1 << N; //unset bit(s): INPUT &= ~(1 << N); + switch(nucleotide) { - case 0://NUCLEOTIDE_T (00) - // ??????00 - // 11?? ~(3 << bit_offset) - // data ???????? - this->data = (unsigned char)(this->data & ~(3 << bit_offset)); + case 0:// A (0000) + this->data = (unsigned char)(this->data & ~( (8+4+2+1) << bit_offset)); // set zero's break; - case 1://NUCLEOTIDE_C (01) - this->data = (unsigned char)(this->data & ~(2 << bit_offset)); - this->data = (unsigned char)(this->data | (1 << bit_offset)); + case 1:// C (0001) + this->data = (unsigned char)(this->data & ~( (8+4+2 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 1) << bit_offset)); // set one's break; - case 2://NUCLEOTIDE_A (10) - this->data = (unsigned char)(this->data & ~(1 << bit_offset)); - this->data = (unsigned char)(this->data | (2 << bit_offset)); + case 2:// G (0010) + this->data = (unsigned char)(this->data & ~( (8+4 +1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 2 ) << bit_offset)); // set one's break; - case 3://NUCLEOTIDE_G (11) - this->data = (unsigned char)(this->data | (nucleotide << bit_offset)); + case 3:// T (0011) + this->data = (unsigned char)(this->data & ~( (8+4 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 2+1) << bit_offset)); // set one's break; + case 4:// U (0100) + this->data = (unsigned char)(this->data & ~( (8 +2+1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 4 ) << bit_offset)); // set one's + break; + case 5:// R (0101) + this->data = (unsigned char)(this->data & ~( (8 +2 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 4 +1) << bit_offset)); // set one's + break; + case 6:// Y (0110) + this->data = (unsigned char)(this->data & ~( (8 +1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 4+2 ) << bit_offset)); // set one's + break; + case 7:// K (0111) + this->data = (unsigned char)(this->data & ~( (8 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( ( 4+2+1) << bit_offset)); // set one's + break; + case 8:// M (1000) + this->data = (unsigned char)(this->data & ~( ( 4+2+1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8 ) << bit_offset)); // set one's + break; + case 9:// S (1001) + this->data = (unsigned char)(this->data & ~( ( 4+2 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8 +1) << bit_offset)); // set one's + break; + case 10:// W (1010) + this->data = (unsigned char)(this->data & ~( ( 4 +1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8 +2 ) << bit_offset)); // set one's + break; + case 11:// B (1011) + this->data = (unsigned char)(this->data & ~( ( 4 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8 +2+1) << bit_offset)); // set one's + break; + case 12:// D (1100) + this->data = (unsigned char)(this->data & ~( ( 2+1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8+4 ) << bit_offset)); // set one's + break; + case 13:// H (1101) + this->data = (unsigned char)(this->data & ~( ( 2 ) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8+4 +1) << bit_offset)); // set one's + break; + case 14:// V (1110) + this->data = (unsigned char)(this->data & ~( ( +1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ( (8+4+2 ) << bit_offset)); // set one's + break; + case 15:// N (1111) + this->data = (unsigned char)(this->data | ( (8+4+2+1) << bit_offset)); // set one's + break; + #if DEBUG default: throw std::invalid_argument("fourbit_byte::set(pos, nucleotide) invalid value\n"); diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index b1cfb812..e3bed55f 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -48,18 +48,18 @@ void twobit_byte::set(unsigned char bit_offset, unsigned char nucleotide) // ??????00 // 11?? ~(3 << bit_offset) // data ???????? - this->data = (unsigned char)(this->data & ~(3 << bit_offset)); + this->data = (unsigned char)(this->data & ~( (2+1) << bit_offset)); break; case 1://NUCLEOTIDE_C (01) - this->data = (unsigned char)(this->data & ~(2 << bit_offset)); - this->data = (unsigned char)(this->data | (1 << bit_offset)); + this->data = (unsigned char)(this->data & ~( (2 ) << bit_offset)); + this->data = (unsigned char)(this->data | ( ( 1) << bit_offset)); break; case 2://NUCLEOTIDE_A (10) - this->data = (unsigned char)(this->data & ~(1 << bit_offset)); - this->data = (unsigned char)(this->data | (2 << bit_offset)); + this->data = (unsigned char)(this->data & ~( ( 1) << bit_offset)); + this->data = (unsigned char)(this->data | ( (2 ) << bit_offset)); break; case 3://NUCLEOTIDE_G (11) - this->data = (unsigned char)(this->data | (nucleotide << bit_offset)); + this->data = (unsigned char)(this->data | ( (2+1) << bit_offset)); break; #if DEBUG default: From aee3202326024caec2558dcbdfc595fc6ef0712a Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 11:52:47 +0100 Subject: [PATCH 005/119] temporary save --- CMakeLists.txt | 13 +- include/fasta_to_fourbit_fastafs.hpp | 61 ++++ ...astafs.hpp => fasta_to_twobit_fastafs.hpp} | 6 +- src/fasta_to_fourbit_fastafs.cpp | 312 ++++++++++++++++++ ...astafs.cpp => fasta_to_twobit_fastafs.cpp} | 32 +- src/fourbit_byte.cpp | 94 ++++-- src/fuse.cpp | 2 +- src/main.cpp | 4 +- src/twobit_byte.cpp | 1 + test/CMakeLists.txt | 26 +- test/cache/test_cache_fourbit.cpp | 211 ++++++++++++ .../{test_cache.cpp => test_cache_twobit.cpp} | 10 +- test/fastafs/test_fastafs.cpp | 16 +- test/fastafs/test_ucsc2bit.cpp | 6 +- test/fourbit_byte/test_fourbit_byte.cpp | 101 ++++-- test/ucsc2bit/test_ucsc2bit_as_fasta.cpp | 4 +- .../test_ucsc2bit_to_fastafs.cpp | 6 +- test/view/test_view.cpp | 10 +- 18 files changed, 800 insertions(+), 115 deletions(-) create mode 100644 include/fasta_to_fourbit_fastafs.hpp rename include/{fasta_to_fastafs.hpp => fasta_to_twobit_fastafs.hpp} (86%) create mode 100644 src/fasta_to_fourbit_fastafs.cpp rename src/{fasta_to_fastafs.cpp => fasta_to_twobit_fastafs.cpp} (94%) create mode 100644 test/cache/test_cache_fourbit.cpp rename test/cache/{test_cache.cpp => test_cache_twobit.cpp} (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0c5855ff..196814ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -100,7 +100,7 @@ endif() add_executable(fastafs src/main.cpp - src/fasta_to_fastafs.cpp + src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp src/fastafs.cpp src/ucsc2bit.cpp @@ -115,7 +115,7 @@ set_target_properties(fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}" # mount-only binary, without all the other stuff 'mount.fastafs' [for fstab] add_executable(mount.fastafs src/main_mount.cpp - src/fasta_to_fastafs.cpp + src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp src/fastafs.cpp src/ucsc2bit.cpp @@ -128,11 +128,12 @@ add_executable(mount.fastafs set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}") add_library(libfastafs SHARED - src/fasta_to_fastafs.cpp + src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp + src/fourbit_byte.cpp src/database.cpp src/utils.cpp src/fuse.cpp @@ -148,7 +149,7 @@ set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs) #set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include") # great, this doesn't go automagically with an entire dir -set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") +set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") #set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include) #set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include") @@ -162,8 +163,8 @@ enable_testing() add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as alias for 'make test' add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N) -add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) -add_test(test_cache "${BUILD_TEST_DIR}/test_cache") +add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) +add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit") add_test(test_view "${BUILD_TEST_DIR}/test_view") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") diff --git a/include/fasta_to_fourbit_fastafs.hpp b/include/fasta_to_fourbit_fastafs.hpp new file mode 100644 index 00000000..58985694 --- /dev/null +++ b/include/fasta_to_fourbit_fastafs.hpp @@ -0,0 +1,61 @@ + +#include + +#include + +#include "config.hpp" +#include "utils.hpp" + +#include "fastafs.hpp" +#include "fourbit_byte.hpp" + + + +class fasta_seq_header_fourbit_conversion_data +{ +public: + void add_ACTG(unsigned char, std::ofstream &);//Adds a T or a U + void add_N(); + void finish_sequence(std::ofstream &); + + off_t file_offset_in_fasta; // file positions where sequence data blocks start + std::string name; + + uint32_t N;// number of N (unknown) nucleotides (n - N = total 2bit compressed nucleotides) + uint32_t n_actg;// number of non-N nucleotides (any [ACTGU]) + + bool previous_was_N; + + + fasta_seq_header_fourbit_conversion_data(off_t fof, std::string name): + file_offset_in_fasta(fof), + name(name), + N(0), + n_actg(0), + previous_was_N(false), + in_m_block(false) + { + MD5_Init(&this->ctx); + } + + + // all below are undefined at initialization + uint32_t padding; + + // the followin should be member of a conversion struct, because they're not related to the original 2bit format: + MD5_CTX ctx; + unsigned char md5_digest[MD5_DIGEST_LENGTH]; + + std::vector n_block_starts; + std::vector n_block_ends; + + std::vector m_block_starts; + std::vector m_block_ends; + bool in_m_block; + + fourbit_byte fourbit_data; +}; + + +size_t fasta_to_fourbit_fastafs(const std::string, const std::string); + diff --git a/include/fasta_to_fastafs.hpp b/include/fasta_to_twobit_fastafs.hpp similarity index 86% rename from include/fasta_to_fastafs.hpp rename to include/fasta_to_twobit_fastafs.hpp index c93a8492..5bb7306d 100644 --- a/include/fasta_to_fastafs.hpp +++ b/include/fasta_to_twobit_fastafs.hpp @@ -11,7 +11,7 @@ -class fasta_seq_header_conversion_data +class fasta_seq_header_twobit_conversion_data { public: void add_ACTG(unsigned char, std::ofstream &);//Adds a T or a U @@ -27,7 +27,7 @@ class fasta_seq_header_conversion_data bool previous_was_N; - fasta_seq_header_conversion_data(off_t fof, std::string name): + fasta_seq_header_twobit_conversion_data(off_t fof, std::string name): file_offset_in_fasta(fof), name(name), N(0), @@ -57,5 +57,5 @@ class fasta_seq_header_conversion_data }; -size_t fasta_to_fastafs(const std::string, const std::string); +size_t fasta_to_twobit_fastafs(const std::string, const std::string); diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp new file mode 100644 index 00000000..bed6ab93 --- /dev/null +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -0,0 +1,312 @@ +#include +#include + +#include "config.hpp" + +#include "fasta_to_fourbit_fastafs.hpp" +#include "utils.hpp" + + + +const static char nt[2] = "T"; +const static char nc[2] = "C"; +const static char na[2] = "A"; +const static char ng[2] = "G"; +const static char nn[2] = "N"; + + + +void fasta_seq_header_fourbit_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) +{ + this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = + + // if fourth nucleotide, 2bit is complete; write to disk + if(this->n_actg % 4 == 3) { + fh_fastafs << this->fourbit_data.data; + } + + if(this->previous_was_N) { + this->n_block_ends.push_back(this->n_actg + this->N - 1); + } + + this->previous_was_N = false; + this->n_actg++; +} + +void fasta_seq_header_fourbit_conversion_data::add_N() +{ + if(!this->previous_was_N) { + this->n_block_starts.push_back(this->n_actg + this->N); + } + + this->previous_was_N = true; + this->N++; +} + + + +void fasta_seq_header_fourbit_conversion_data::finish_sequence(std::ofstream &fh_fastafs) +{ + uint32_t j; + + // flush last nucleotide + if(this->n_actg % 4 != 0) { + for(j = this->n_actg % 4; j < 4; j++) { + this->fourbit_data.set(fourbit_byte::iterator_to_offset(j), 0); + } + fh_fastafs << this->fourbit_data.data; + } + + if(this->previous_was_N) { + this->n_block_ends.push_back(this->n_actg + this->N - 1); + } + + // do M block + if(this->in_m_block) { + this->m_block_ends.push_back(this->n_actg + this->N - 1); + //printf("closing m-block: %u\n",this->n_actg + this->N - 1); + } + +#if DEBUG + if(this->m_block_starts.size() != this->m_block_ends.size()) { + throw std::runtime_error("M blocks not correctly parsed\n"); + } +#endif //DEBUG + + char buffer[4 + 1]; + + // (over)write number nucleotides + std::streamoff index_file_position = fh_fastafs.tellp(); + fh_fastafs.seekp(this->file_offset_in_fasta, std::ios::beg); + uint_to_fourbytes(buffer, this->n_actg); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + fh_fastafs.seekp(index_file_position, std::ios::beg); + + // N blocks + uint_to_fourbytes(buffer, (uint32_t) this->n_block_starts.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + for(j = 0; j < this->n_block_starts.size(); j++) { + uint_to_fourbytes(buffer, this->n_block_starts[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } + for(j = 0; j < this->n_block_ends.size(); j++) { + uint_to_fourbytes(buffer, this->n_block_ends[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } + + // write checksum + MD5_Final(this->md5_digest, &this->ctx); + fh_fastafs.write(reinterpret_cast(&this->md5_digest), (size_t) 16); + + // M blocks + uint_to_fourbytes(buffer, (uint32_t) this->m_block_starts.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + for(j = 0; j < this->m_block_starts.size(); j++) { + uint_to_fourbytes(buffer, this->m_block_starts[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } + for(j = 0; j < this->m_block_ends.size(); j++) { + uint_to_fourbytes(buffer, this->m_block_ends[j]); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + } +} + + + +size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string fastafs_file) +{ + std::vector index; + fasta_seq_header_fourbit_conversion_data* s; + + // @todo use ifstream and ofstream argument types + std::string line; + std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); + std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + if(fh_fasta.is_open() and fh_fastafs.is_open()) { + fh_fastafs << FASTAFS_MAGIC; + fh_fastafs << FASTAFS_VERSION; + fh_fastafs << "\x00\x00"s;// the flag for now, set to INCOMPLETE as writing is in progress + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET + + // iterate until first sequence is found, ensuring we won't write to uninitialized sequences + s = nullptr; + while(s == nullptr and getline(fh_fasta, line)) { + if(line[0] == '>') { + line.erase(0, 1);// erases first part, quicker would be pointer from first char + s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); + fh_fastafs << "\x00\x00\x00\x00"s;// placeholder for sequence length + index.push_back(s); + } + } + + if(s != nullptr) { + while(getline(fh_fasta, line)) { + if(line[0] == '>') { + s->finish_sequence(fh_fastafs); + line.erase(0, 1);// erases first part, quicker would be pointer from first char + + s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); + fh_fastafs << "\x00\x00\x00\x00"s;// number of 2bit encoded nucleotides, not yet known + index.push_back(s); + } else { + for(std::string::iterator it = line.begin(); it != line.end(); ++it) { + switch(*it) { + + case 'U': + case 'T': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(NUCLEOTIDE_T, fh_fastafs); + MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_nucleotide + break; + case 'u':// lower case = m block + case 't': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(NUCLEOTIDE_T, fh_fastafs); + MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_nucleotide + break; + case 'C': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(NUCLEOTIDE_C, fh_fastafs); + MD5_Update(&s->ctx, nc, 1); + break; + case 'c': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(NUCLEOTIDE_C, fh_fastafs); + MD5_Update(&s->ctx, nc, 1); + break; + case 'A': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(NUCLEOTIDE_A, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'a': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(NUCLEOTIDE_A, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'G': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(NUCLEOTIDE_G, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'g': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(NUCLEOTIDE_G, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'N': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_N(); + MD5_Update(&s->ctx, nn, 1); + break; + case 'n': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_N(); + MD5_Update(&s->ctx, nn, 1); + break; + default: + std::cerr << "invalid chars in FASTA file" << std::endl; + exit(1); + break; + } + } + } + } + } + fh_fasta.close(); + } + if(s != nullptr) { + s->finish_sequence(fh_fastafs);// finish last sequence + } + + // write index/footer + unsigned int index_file_position = (uint32_t) fh_fastafs.tellp(); + char buffer[4 + 1]; + uint_to_fourbytes(buffer, (uint32_t) index.size()); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + for(size_t i = 0; i < index.size(); i++) { + s = index[i]; + + // flag + fh_fastafs << "\x00\x08"s; + + // name + unsigned char name_size = (unsigned char) s->name.size(); + fh_fastafs.write((char *) &name_size, 1); // name size + fh_fastafs.write(s->name.c_str(), (size_t) s->name.size());// name + + // location of sequence data in file + uint_to_fourbytes(buffer, (uint32_t) s->file_offset_in_fasta); + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + delete s; + } + fh_fastafs << "\x00"s;// no metadata tags (YET) + + // update header: set to updated + fh_fastafs.seekp(8, std::ios::beg); + fh_fastafs << "\x00\x01"s; // updated flag + + uint_to_fourbytes(buffer, index_file_position);//position of header + fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + + // calc written size + fh_fastafs.seekp(0, std::ios::end); + size_t written = fh_fastafs.tellp(); + + fh_fasta.close(); + fh_fastafs.close(); + + return written; +} diff --git a/src/fasta_to_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp similarity index 94% rename from src/fasta_to_fastafs.cpp rename to src/fasta_to_twobit_fastafs.cpp index fc1a1b81..b28bd3ef 100644 --- a/src/fasta_to_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -3,12 +3,20 @@ #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "utils.hpp" -void fasta_seq_header_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) +const static char nt[2] = "T"; +const static char nc[2] = "C"; +const static char na[2] = "A"; +const static char ng[2] = "G"; +const static char nn[2] = "N"; + + + +void fasta_seq_header_twobit_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) { this->twobit_data.set(twobit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = @@ -25,7 +33,7 @@ void fasta_seq_header_conversion_data::add_ACTG(unsigned char nucleotide, std::o this->n_actg++; } -void fasta_seq_header_conversion_data::add_N() +void fasta_seq_header_twobit_conversion_data::add_N() { if(!this->previous_was_N) { this->n_block_starts.push_back(this->n_actg + this->N); @@ -37,7 +45,7 @@ void fasta_seq_header_conversion_data::add_N() -void fasta_seq_header_conversion_data::finish_sequence(std::ofstream &fh_fastafs) +void fasta_seq_header_twobit_conversion_data::finish_sequence(std::ofstream &fh_fastafs) { uint32_t j; @@ -106,16 +114,10 @@ void fasta_seq_header_conversion_data::finish_sequence(std::ofstream &fh_fastafs -const static char nt[2] = "T"; -const static char nc[2] = "C"; -const static char na[2] = "A"; -const static char ng[2] = "G"; -const static char nn[2] = "N"; - -size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_file) +size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string fastafs_file) { - std::vector index; - fasta_seq_header_conversion_data* s; + std::vector index; + fasta_seq_header_twobit_conversion_data* s; // @todo use ifstream and ofstream argument types std::string line; @@ -132,7 +134,7 @@ size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_ while(s == nullptr and getline(fh_fasta, line)) { if(line[0] == '>') { line.erase(0, 1);// erases first part, quicker would be pointer from first char - s = new fasta_seq_header_conversion_data(fh_fastafs.tellp(), line); + s = new fasta_seq_header_twobit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// placeholder for sequence length index.push_back(s); } @@ -144,7 +146,7 @@ size_t fasta_to_fastafs(const std::string fasta_file, const std::string fastafs_ s->finish_sequence(fh_fastafs); line.erase(0, 1);// erases first part, quicker would be pointer from first char - s = new fasta_seq_header_conversion_data(fh_fastafs.tellp(), line); + s = new fasta_seq_header_twobit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// number of 2bit encoded nucleotides, not yet known index.push_back(s); } else { diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index c31cc63a..231500f1 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -137,31 +137,79 @@ void fourbit_byte::set(unsigned char bit_offset, unsigned char nucleotide) // N's are treated as 0, for some weird reason void fourbit_byte::set(char* buffer) { - const std::array< unsigned char, 4> bit_offsets = {6, 4, 2, 0}; - for(unsigned char i = 0; i < 4; i++) { + const std::array< unsigned char, 4> bit_offsets = {4, 0}; + for(unsigned char i = 0; i < 2; i++) { switch(buffer[i]) { - case 't': - case 'T': - case 'n': - case 'N': - this->set(bit_offsets[i], 0); - break; - case 'c': - case 'C': - this->set(bit_offsets[i], 1); - break; - case 'a': - case 'A': - this->set(bit_offsets[i], 2); - break; - case 'g': - case 'G': - this->set(bit_offsets[i], 3); - break; + + case 'A':// A (0000) + case 'a': + this->set(bit_offsets[i], 0); + break; + case 'C':// C (0001) + case 'c': + this->set(bit_offsets[i], 1); + break; + case 'G':// G (0010) + case 'g': + this->set(bit_offsets[i], 2); + break; + case 'T':// T (0011) + case 't': + this->set(bit_offsets[i], 3); + break; + case 'U':// U (0100) + case 'u': + this->set(bit_offsets[i], 4); + break; + case 'R':// R (0101) + case 'r': + this->set(bit_offsets[i], 5); + break; + case 'Y':// Y (0110) + case 'y': + this->set(bit_offsets[i], 6); + break; + case 'K':// K (0111) + case 'k': + this->set(bit_offsets[i], 7); + break; + case 'M':// M (1000) + case 'm': + this->set(bit_offsets[i], 8); + break; + case 'S':// S (1001) + case 's': + this->set(bit_offsets[i], 9); + break; + case 'W':// W (1010) + case 'w': + this->set(bit_offsets[i], 10); + break; + case 'B':// B (1011) + case 'b': + this->set(bit_offsets[i], 11); + break; + case 'D':// D (1100) + case 'd': + this->set(bit_offsets[i], 12); + break; + case 'H':// H (1101) + case 'h': + this->set(bit_offsets[i], 13); + break; + case 'V':// V (1110) + case 'v': + this->set(bit_offsets[i], 14); + break; + case 'N':// N (1111) + case 'n': + this->set(bit_offsets[i], 15); + break; + #if DEBUG - default: - throw std::invalid_argument("fourbit_byte::set(char *) invalid value\n"); - break; + default: + throw std::invalid_argument("fourbit_byte::set(char *) invalid value\n"); + break; #endif //DEBUG } } diff --git a/src/fuse.cpp b/src/fuse.cpp index d71ceed6..3c5fcbe9 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -421,7 +421,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) char current_argument = '\0';// could be o for '-o', etc. std::vector full_args = {}; - for(unsigned int i = 0; i < argc; ++i) { + for(signed int i = 0; i < argc; ++i) { printf("processing argv[%i] = '%s' [current argument=%i]\n", i, argv[i], (int) current_argument); if(current_argument != '\0') { // parse the arguments' value diff --git a/src/main.cpp b/src/main.cpp index b58bb48a..1e4697ec 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,7 +6,7 @@ #include #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "ucsc2bit_to_fastafs.hpp" #include "database.hpp" #include "fuse.hpp" @@ -96,7 +96,7 @@ int main(int argc, char *argv[]) std::string fname_out = d.add(argv[argc - 2]); if(is_fasta_file(argv[argc - 1])) { - fasta_to_fastafs(argv[argc - 1], fname_out); + fasta_to_twobit_fastafs(argv[argc - 1], fname_out); } else { ucsc2bit_to_fastafs(argv[argc - 1], fname_out); } diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index e3bed55f..a4af1b9d 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -72,6 +72,7 @@ void twobit_byte::set(unsigned char bit_offset, unsigned char nucleotide) // input char "AACCCTTGG" // N's are treated as 0, for some weird reason +// this function seems specific for UCSC 2 bit format?! - if so, denote it like that void twobit_byte::set(char* buffer) { const std::array< unsigned char, 4> bit_offsets = {6, 4, 2, 0}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6f336dfd..5144c37f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,26 +18,27 @@ set(BUILD_DIR "../bin") set(BUILD_TEST_DIR "${BUILD_DIR}/test") -add_executable(test_cache cache/test_cache.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_view view/test_view.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -#add_executable(test_tree tree/test_tree.cpp) -add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_fastafs.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) +#add_executable(test_tree tree/test_tree.cpp) add_test(test_ucsc2bit_to_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_to_fasta") -set_target_properties(test_cache +set_target_properties(test_cache_twobit + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_cache_fourbit PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_view PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") -#set_target_properties(test_tree -# PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs_as_ucsc2bit @@ -52,3 +53,6 @@ set_target_properties(test_ucsc2bit_as_fasta PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_utils PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +#set_target_properties(test_tree +# PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") + diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp new file mode 100644 index 00000000..725593f2 --- /dev/null +++ b/test/cache/test_cache_fourbit.cpp @@ -0,0 +1,211 @@ +#define BOOST_TEST_MODULE fastfs_cache_fourbit + +#include + +#include "config.hpp" + +#include "fasta_to_fourbit_fastafs.hpp" + + + +BOOST_AUTO_TEST_SUITE(Testing) + +/** + * @brief + * + * @test + */ +BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) +{ + fourbit_byte b = fourbit_byte(); + + char *seq1; + char *seq2; + char *seq3; + char *seq4; + + const char *seq;// don't dereference, pointer to static two_bit property + + // test 0000 0000 -> 00000000 -> 0 + //b.set(6, NUCLEOTIDE_T); + //b.set(4, NUCLEOTIDE_T); + //b.set(2, NUCLEOTIDE_T); + //b.set(0, NUCLEOTIDE_T); + //BOOST_CHECK_EQUAL(b.data, 0); + + /* + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "TT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "TTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "TTTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "TTTT"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + + + // test 00 00 11 00 -> 00001100 -> 8+4 -> 12 + b.set(6, NUCLEOTIDE_T); + b.set(4, NUCLEOTIDE_T); + b.set(2, NUCLEOTIDE_G); + b.set(0, NUCLEOTIDE_T); + BOOST_CHECK_EQUAL(b.data, 12); + + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "TT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "TTG"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "TTGT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "TTGT"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + + + // test 00 11 00 00 -> 00110000 -> 16+32 -> 48 + b.set(6, NUCLEOTIDE_T); + b.set(4, NUCLEOTIDE_G); + b.set(2, NUCLEOTIDE_T); + b.set(0, NUCLEOTIDE_T); + BOOST_CHECK_EQUAL(b.data, 48); + + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "TG"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "TGT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "TGTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "TGTT"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + + + // test 11 00 00 00 -> 11000000 -> 64+128 -> 192 + b.set(6, NUCLEOTIDE_G); + b.set(4, NUCLEOTIDE_T); + b.set(2, NUCLEOTIDE_T); + b.set(0, NUCLEOTIDE_T); + BOOST_CHECK_EQUAL(b.data, 192); + + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "G"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "GT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "GTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "GTTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "GTTT"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + + + // test 10 01 01 10 -> 10010110 -> 2 + 4 + 16 + 128 -> 150 + b.set(6, NUCLEOTIDE_A); + b.set(4, NUCLEOTIDE_C); + b.set(2, NUCLEOTIDE_C); + b.set(0, NUCLEOTIDE_A); + BOOST_CHECK_EQUAL(b.data, 150); + + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "A"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "AC"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "ACC"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "ACCA"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "ACCA"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + + + // test 11 11 11 11 -> 11111111 -> 255 + b.set(6, NUCLEOTIDE_G); + b.set(4, NUCLEOTIDE_G); + b.set(2, NUCLEOTIDE_G); + b.set(0, NUCLEOTIDE_G); + BOOST_CHECK_EQUAL(b.data, 255); + + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "G"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "GG"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "GGG"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "GGGG"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "GGGG"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + + + // test 00 00 00 00 -> 00000000 -> 0 + b.set(6, NUCLEOTIDE_T); + b.set(4, NUCLEOTIDE_T); + b.set(2, NUCLEOTIDE_T); + b.set(0, NUCLEOTIDE_T); + BOOST_CHECK_EQUAL(b.data, 0); + + seq1 = b.get(1); + seq2 = b.get(2); + seq3 = b.get(3); + seq4 = b.get(4); + seq = b.get(); + + BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "TT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq3, "TTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq4, "TTTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "TTTT"), 0); + + delete[] seq1; + delete[] seq2; + delete[] seq3; + delete[] seq4; + */ +} + + + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/cache/test_cache.cpp b/test/cache/test_cache_twobit.cpp similarity index 97% rename from test/cache/test_cache.cpp rename to test/cache/test_cache_twobit.cpp index 44279c1b..5bcb2468 100644 --- a/test/cache/test_cache.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -1,11 +1,11 @@ -#define BOOST_TEST_MODULE fastfs_cache +#define BOOST_TEST_MODULE fastfs_cache_twobit #include #include "config.hpp" //#include "twobit_byte.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" @@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(Test_size) */ BOOST_AUTO_TEST_CASE(test_cache) { - size_t written = fasta_to_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); + size_t written = fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); static std::string reference = // GENERIC-HEADER @@ -358,7 +358,7 @@ BOOST_AUTO_TEST_CASE(test_cache) BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) { // generate FASTAFS file from FASTA file - fasta_to_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); + fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); // load the FASTAFS file fastafs f2 = fastafs("test"); @@ -418,7 +418,7 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) BOOST_AUTO_TEST_CASE(test_cache_with_newlines) { // generate FASTAFS file from FASTA file - fasta_to_fastafs("test/data/test_003.fa", "tmp/test_cachce_test_003.fastafs"); + fasta_to_twobit_fastafs("test/data/test_003.fa", "tmp/test_cachce_test_003.fastafs"); // load the FASTAFS file fastafs f2 = fastafs("test"); diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index c329ee6b..364dbd6e 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -8,7 +8,7 @@ #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -36,7 +36,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); @@ -95,7 +95,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); @@ -134,7 +134,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0__no_masking) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -175,7 +175,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -196,7 +196,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_md5) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -226,7 +226,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test_002.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test_002.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -353,7 +353,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs__dict_virtualization) */ std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); diff --git a/test/fastafs/test_ucsc2bit.cpp b/test/fastafs/test_ucsc2bit.cpp index 09049c03..e11836d7 100644 --- a/test/fastafs/test_ucsc2bit.cpp +++ b/test/fastafs/test_ucsc2bit.cpp @@ -128,7 +128,7 @@ NNACTG #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -153,7 +153,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_view_chunked_2bit) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); // 2. load fastafs fastafs fs = fastafs("test"); @@ -400,7 +400,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_view_chunked_2bit_with_offset) { // 1: create FASTAFS file std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); diff --git a/test/fourbit_byte/test_fourbit_byte.cpp b/test/fourbit_byte/test_fourbit_byte.cpp index e74f3f57..bf2806bc 100644 --- a/test/fourbit_byte/test_fourbit_byte.cpp +++ b/test/fourbit_byte/test_fourbit_byte.cpp @@ -12,38 +12,83 @@ BOOST_AUTO_TEST_SUITE(Testing) BOOST_AUTO_TEST_CASE(test_fourbit_conversions) { - char seq[5]; - seq[4] = '\0'; + char seq[3]; + const char* seq_get; + seq[2] = '\0'; fourbit_byte t; - seq[0] = 'A'; - seq[1] = 'A'; - seq[2] = 'A'; - seq[3] = 'A'; - t.set(seq);//10101010 = 170 - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 170); - - seq[0] = 'T'; - seq[1] = 'A'; - seq[2] = 'A'; - seq[3] = 'A'; - t.set(seq); - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 42); + seq[0] = 'A'; seq[1] = 'C'; - seq[2] = 'T'; - seq[3] = 'G'; t.set(seq); - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 147); - seq[0] = 'N'; - seq[1] = 'C'; - seq[2] = 'T'; - seq[3] = 'N'; - t.set(seq);//00 01 00 00 - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 16); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 1); + + seq[0] = 'G'; + seq[1] = 'T'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 35); + + seq[0] = 'U'; + seq[1] = 'R'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 69); + + seq[0] = 'Y'; + seq[1] = 'K'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 103); + + seq[0] = 'M'; + seq[1] = 'S'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 137); + + seq[0] = 'W'; + seq[1] = 'B'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 171); + + seq[0] = 'D'; + seq[1] = 'H'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 205); + + seq[0] = 'V'; + seq[1] = 'N'; + t.set(seq); + seq_get = t.get(); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, seq_get); + BOOST_CHECK_EQUAL(seq[0], seq_get[0]); + BOOST_CHECK_EQUAL(seq[1], seq_get[1]); + BOOST_CHECK_EQUAL(t.data, 239); + } BOOST_AUTO_TEST_CASE(test_fourbit_static_offset_conversion_test) diff --git a/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp b/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp index 98103767..4b7a6e93 100644 --- a/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp +++ b/test/ucsc2bit/test_ucsc2bit_as_fasta.cpp @@ -9,7 +9,7 @@ #include "config.hpp" #include "ucsc2bit.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -37,7 +37,7 @@ BOOST_AUTO_TEST_CASE(test_ucsc2bit_to_fasta_file) // 1: FASTA to FASTAFS file: std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); BOOST_REQUIRE(fs.data.size() > 0); diff --git a/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp b/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp index 36361cab..191b4c0f 100644 --- a/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp +++ b/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp @@ -5,7 +5,7 @@ #include "config.hpp" #include "utils.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" #include "ucsc2bit_to_fastafs.hpp" @@ -21,8 +21,8 @@ BOOST_AUTO_TEST_CASE(test_ucsc2bit_to_fasta) std::string fastafs_file2 = "tmp/test.regenerated.fastafs"; std::string ucsc2bit_file = "tmp/test.2bit"; - // 01 fasta_to_fastafs() - fasta_to_fastafs("test/data/test.fa", fastafs_file); + // 01 fasta_to_twobit_fastafs() + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); // 02 load fastafs fastafs fs = fastafs("test"); diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 4bce700a..5d852948 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -8,7 +8,7 @@ #include "config.hpp" -#include "fasta_to_fastafs.hpp" +#include "fasta_to_twobit_fastafs.hpp" #include "fastafs.hpp" @@ -139,7 +139,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_twobit_offset_calc) std::string fastafs_file = "tmp/test.fastafs"; - fasta_to_fastafs("test/data/test.fa", fastafs_file); + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); fastafs fs = fastafs("test"); fs.load(fastafs_file); @@ -186,7 +186,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) std::string fasta_file = "test/data/" + test_name + ".fa"; std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_fastafs(fasta_file, fastafs_file); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); fastafs fs = fastafs(test_name); fs.load(fastafs_file); @@ -355,7 +355,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) std::string fasta_file = "test/data/" + test_name + ".fa"; std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_fastafs(fasta_file, fastafs_file); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); fastafs fs = fastafs(test_name); fs.load(fastafs_file); @@ -395,7 +395,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) std::string fasta_file = "test/data/" + test_name + ".fa"; std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_fastafs(fasta_file, fastafs_file); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); fastafs fs = fastafs(test_name); fs.load(fastafs_file); From 024b2179f73981d77bed40aa77402f9887c0beb4 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 12:37:51 +0100 Subject: [PATCH 006/119] closing in --- src/fourbit_byte.cpp | 6 ++++ src/twobit_byte.cpp | 8 +++++ test/cache/test_cache_fourbit.cpp | 50 ++++++++++--------------------- 3 files changed, 29 insertions(+), 35 deletions(-) diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index 231500f1..8673aa6f 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -221,6 +221,12 @@ void fourbit_byte::set(char* buffer) **/ char *fourbit_byte::get(unsigned char length) { +#if DEBUG + if(length > 2) { + throw std::invalid_argument("four_byte::get(unsigned char length) -> out of bound: " + std::to_string(length)+ "\n"); + } +#endif //DEBUG + char *seq = new char[length + 1]; for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 seq[i] = fourbit_byte::fourbit_hash[this->data][i]; diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index a4af1b9d..0a6ac8ca 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -111,11 +111,19 @@ void twobit_byte::set(char* buffer) **/ char *twobit_byte::get(unsigned char length) { +#if DEBUG + if(length > 4) { + throw std::invalid_argument("twobit_byte::get(unsigned char length) -> out of bound: " + std::to_string(length) + "\n"); + } +#endif //DEBUG + char *seq = new char[length + 1]; + for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 seq[i] = twobit_byte::twobit_hash[this->data][i]; } seq[length] = '\0'; + return seq; } diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 725593f2..7db59206 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -21,61 +21,41 @@ BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) char *seq1; char *seq2; - char *seq3; - char *seq4; - const char *seq;// don't dereference, pointer to static two_bit property + const char *seq;// don't dereference, pointer to static four_bit property // test 0000 0000 -> 00000000 -> 0 - //b.set(6, NUCLEOTIDE_T); - //b.set(4, NUCLEOTIDE_T); - //b.set(2, NUCLEOTIDE_T); - //b.set(0, NUCLEOTIDE_T); - //BOOST_CHECK_EQUAL(b.data, 0); + b.set(4, 0);// A => 0 + b.set(0, 0); + BOOST_CHECK_EQUAL(b.data, 0); - /* seq1 = b.get(1); seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); seq = b.get(); - BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); - BOOST_CHECK_EQUAL(strcmp(seq2, "TT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "TTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "TTTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "TTTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq1, "A"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "AA"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "AA"), 0); delete[] seq1; delete[] seq2; - delete[] seq3; - delete[] seq4; - - // test 00 00 11 00 -> 00001100 -> 8+4 -> 12 - b.set(6, NUCLEOTIDE_T); - b.set(4, NUCLEOTIDE_T); - b.set(2, NUCLEOTIDE_G); - b.set(0, NUCLEOTIDE_T); - BOOST_CHECK_EQUAL(b.data, 12); + // test 11 10 11 11 -> 00001100 -> 239 + b.set(4, 14); // V: 14 + b.set(0, 15); // N: 15 + BOOST_CHECK_EQUAL(b.data, 239); seq1 = b.get(1); seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); seq = b.get(); - BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); - BOOST_CHECK_EQUAL(strcmp(seq2, "TT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "TTG"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "TTGT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "TTGT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq1, "V"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "VN"), 0); delete[] seq1; delete[] seq2; - delete[] seq3; - delete[] seq4; - + +/* // test 00 11 00 00 -> 00110000 -> 16+32 -> 48 b.set(6, NUCLEOTIDE_T); From b499fc64c079b460e4c4157f7ca49136e390fcae Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 16:43:15 +0100 Subject: [PATCH 007/119] firstr part of cache test succeeded --- test/cache/test_cache_fourbit.cpp | 112 +++++++----------------------- 1 file changed, 26 insertions(+), 86 deletions(-) diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 7db59206..10eaf6a6 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -55,132 +55,72 @@ BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) delete[] seq1; delete[] seq2; -/* - - // test 00 11 00 00 -> 00110000 -> 16+32 -> 48 - b.set(6, NUCLEOTIDE_T); - b.set(4, NUCLEOTIDE_G); - b.set(2, NUCLEOTIDE_T); - b.set(0, NUCLEOTIDE_T); - BOOST_CHECK_EQUAL(b.data, 48); - - seq1 = b.get(1); - seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); - seq = b.get(); - - BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); - BOOST_CHECK_EQUAL(strcmp(seq2, "TG"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "TGT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "TGTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "TGTT"), 0); - - delete[] seq1; - delete[] seq2; - delete[] seq3; - delete[] seq4; - - - // test 11 00 00 00 -> 11000000 -> 64+128 -> 192 - b.set(6, NUCLEOTIDE_G); - b.set(4, NUCLEOTIDE_T); - b.set(2, NUCLEOTIDE_T); - b.set(0, NUCLEOTIDE_T); - BOOST_CHECK_EQUAL(b.data, 192); + // GT: 0010 0011 + b.set(4, 2); // G + b.set(0, 3); // T + BOOST_CHECK_EQUAL(b.data, 35); seq1 = b.get(1); seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); seq = b.get(); BOOST_CHECK_EQUAL(strcmp(seq1, "G"), 0); BOOST_CHECK_EQUAL(strcmp(seq2, "GT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "GTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "GTTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "GTTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "GT"), 0); delete[] seq1; delete[] seq2; - delete[] seq3; - delete[] seq4; - // test 10 01 01 10 -> 10010110 -> 2 + 4 + 16 + 128 -> 150 - b.set(6, NUCLEOTIDE_A); - b.set(4, NUCLEOTIDE_C); - b.set(2, NUCLEOTIDE_C); - b.set(0, NUCLEOTIDE_A); - BOOST_CHECK_EQUAL(b.data, 150); + // set to UR (0100 0101) + b.set(4, 4); + b.set(0, 5); + BOOST_CHECK_EQUAL(b.data, 69); seq1 = b.get(1); seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); seq = b.get(); - BOOST_CHECK_EQUAL(strcmp(seq1, "A"), 0); - BOOST_CHECK_EQUAL(strcmp(seq2, "AC"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "ACC"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "ACCA"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "ACCA"), 0); + BOOST_CHECK_EQUAL(strcmp(seq1, "U"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "UR"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "UR"), 0); delete[] seq1; delete[] seq2; - delete[] seq3; - delete[] seq4; - // test 11 11 11 11 -> 11111111 -> 255 - b.set(6, NUCLEOTIDE_G); - b.set(4, NUCLEOTIDE_G); - b.set(2, NUCLEOTIDE_G); - b.set(0, NUCLEOTIDE_G); - BOOST_CHECK_EQUAL(b.data, 255); + // set to AN (0000 1111) + b.set(4, 0); + b.set(0, 15); + BOOST_CHECK_EQUAL(b.data, 15); seq1 = b.get(1); seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); seq = b.get(); - BOOST_CHECK_EQUAL(strcmp(seq1, "G"), 0); - BOOST_CHECK_EQUAL(strcmp(seq2, "GG"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "GGG"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "GGGG"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "GGGG"), 0); + BOOST_CHECK_EQUAL(strcmp(seq1, "A"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "AN"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "AN"), 0); delete[] seq1; delete[] seq2; - delete[] seq3; - delete[] seq4; - // test 00 00 00 00 -> 00000000 -> 0 - b.set(6, NUCLEOTIDE_T); - b.set(4, NUCLEOTIDE_T); - b.set(2, NUCLEOTIDE_T); - b.set(0, NUCLEOTIDE_T); - BOOST_CHECK_EQUAL(b.data, 0); + // set to NA (1111 0000) + b.set(4, 15); + b.set(0, 0); + BOOST_CHECK_EQUAL(b.data, 240); seq1 = b.get(1); seq2 = b.get(2); - seq3 = b.get(3); - seq4 = b.get(4); seq = b.get(); - BOOST_CHECK_EQUAL(strcmp(seq1, "T"), 0); - BOOST_CHECK_EQUAL(strcmp(seq2, "TT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq3, "TTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq4, "TTTT"), 0); - BOOST_CHECK_EQUAL(strcmp(seq, "TTTT"), 0); + BOOST_CHECK_EQUAL(strcmp(seq1, "N"), 0); + BOOST_CHECK_EQUAL(strcmp(seq2, "NA"), 0); + BOOST_CHECK_EQUAL(strcmp(seq, "NA"), 0); delete[] seq1; delete[] seq2; - delete[] seq3; - delete[] seq4; - */ } From 422cb5751c2f201c0e023aa1653d1ea44e454bbb Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 16:45:49 +0100 Subject: [PATCH 008/119] firstr part of cache test succeeded --- test/cache/test_cache_fourbit.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 10eaf6a6..90ad5b45 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -125,6 +125,18 @@ BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) +/** + * @brief + * + * @test tests whether a fourbit object is indeed stored as a single byte + */ +BOOST_AUTO_TEST_CASE(Test_size) +{ + fourbit_byte b = fourbit_byte(); + BOOST_CHECK_EQUAL(sizeof(b.data), 1); +} + + From 8c3a2f9f4b240af302b3827cae954916cf6fc683 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 17:01:31 +0100 Subject: [PATCH 009/119] doc --- doc/FASTAFS-FORMAT-SPECIFICATION.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index 0a1580e2..e1f79578 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -120,8 +120,20 @@ Repeated for every sequence, in order matching SEQUENCE-HEADER The sequence flag allows to describe the following metadata for each sequence: ``` -bit 0 is-rna [1 = yes, 0 = DNA] -bit 1 reserved [reserved, library type 2 -> protein] +bit 0 combined sequence +bit 1 type +``` + +| bit 0 | bit 1 | Type | +| ---- | ---- | - | +| `0` | `0` | DNA (`ACTG` + `N`) | +| `1` | `0` | RNA (`ACUG` + `N`) | +| `0` | `1` | Nucleotide IUPEC (`ACGTURYKMSWBDHVN` + `-`) | +| `1` | `1` | reserved | + +--- + +``` bit 2 reserved [reserved, library type 2 -> protein] bit 3 is-complete [1: checksum is present, 0: some regions are reserved but not yet 'downloaded'] bit 4 is-circular From 0502f53429b086652be66a827d5c9b0e58de92e3 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 17:03:54 +0100 Subject: [PATCH 010/119] spec --- doc/FASTAFS-FORMAT-SPECIFICATION.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index e1f79578..a4af8675 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -120,18 +120,16 @@ Repeated for every sequence, in order matching SEQUENCE-HEADER The sequence flag allows to describe the following metadata for each sequence: ``` -bit 0 combined sequence -bit 1 type +bit 0 combined sequence type +bit 1 combined sequence type ``` -| bit 0 | bit 1 | Type | +| bit-0 | bit-1 | Type | Alphabet | | ---- | ---- | - | -| `0` | `0` | DNA (`ACTG` + `N`) | -| `1` | `0` | RNA (`ACUG` + `N`) | -| `0` | `1` | Nucleotide IUPEC (`ACGTURYKMSWBDHVN` + `-`) | -| `1` | `1` | reserved | - ---- +| `0` | `0` | DNA | `ACTG` + `N`) | +| `1` | `0` | RNA | `ACUG` + `N`) | +| `0` | `1` | IUPEC Nucleotide | `ACGTURYKMSWBDHVN` + `-` | +| `1` | `1` | reserved for protein | to be determined | ``` bit 2 reserved [reserved, library type 2 -> protein] From 2b2df5a8042a0367a09cd524254e28850bdd0ca2 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 17:04:48 +0100 Subject: [PATCH 011/119] tmp --- doc/FASTAFS-FORMAT-SPECIFICATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index a4af8675..0e335d8e 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -125,7 +125,7 @@ bit 1 combined sequence type ``` | bit-0 | bit-1 | Type | Alphabet | -| ---- | ---- | - | +| ---- | ---- | - | - | | `0` | `0` | DNA | `ACTG` + `N`) | | `1` | `0` | RNA | `ACUG` + `N`) | | `0` | `1` | IUPEC Nucleotide | `ACGTURYKMSWBDHVN` + `-` | From 00d387c960c61c5ba2a31a7c6ebd157a643836e4 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 27 Nov 2019 17:28:18 +0100 Subject: [PATCH 012/119] sav --- doc/FASTAFS-FORMAT-SPECIFICATION.md | 4 +- src/fasta_to_fourbit_fastafs.cpp | 311 +++++++++++++++++++++++++--- test/cache/test_cache_fourbit.cpp | 6 + 3 files changed, 286 insertions(+), 35 deletions(-) diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index 0e335d8e..9169724f 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -126,8 +126,8 @@ bit 1 combined sequence type | bit-0 | bit-1 | Type | Alphabet | | ---- | ---- | - | - | -| `0` | `0` | DNA | `ACTG` + `N`) | -| `1` | `0` | RNA | `ACUG` + `N`) | +| `0` | `0` | DNA | `ACTG` + `N` | +| `1` | `0` | RNA | `ACUG` + `N` | | `0` | `1` | IUPEC Nucleotide | `ACGTURYKMSWBDHVN` + `-` | | `1` | `1` | reserved for protein | to be determined | diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index bed6ab93..0d313ec4 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -8,20 +8,42 @@ -const static char nt[2] = "T"; -const static char nc[2] = "C"; const static char na[2] = "A"; +const static char nc[2] = "C"; const static char ng[2] = "G"; +const static char nt[2] = "T"; + +const static char nu[2] = "U"; +const static char nr[2] = "R"; +const static char ny[2] = "Y"; +const static char nk[2] = "K"; +const static char nm[2] = "M"; +const static char ns[2] = "S"; +const static char nw[2] = "W"; +const static char nb[2] = "B"; +const static char nd[2] = "D"; +const static char nh[2] = "H"; +const static char nv[2] = "V"; + const static char nn[2] = "N"; +//const char fourbit_byte::fourbit_alhpabet[17] = "ACGTURYKMSWBDHVN"; + void fasta_seq_header_fourbit_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) { + std::cout << this->n_actg; + std::cout << "\n"; + std::cout << " => " << std::to_string(fourbit_byte::iterator_to_offset(this->n_actg)) << "\n\n"; + + // 0 -> 4 + // 1 -> 0 toch? + this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = // if fourth nucleotide, 2bit is complete; write to disk - if(this->n_actg % 4 == 3) { + if(this->n_actg % 2 == 1) { fh_fastafs << this->fourbit_data.data; } @@ -50,10 +72,10 @@ void fasta_seq_header_fourbit_conversion_data::finish_sequence(std::ofstream &fh uint32_t j; // flush last nucleotide - if(this->n_actg % 4 != 0) { - for(j = this->n_actg % 4; j < 4; j++) { + if(this->n_actg % 2 != 0) { + //for(j = this->n_actg % 2; j < 1; j++) { this->fourbit_data.set(fourbit_byte::iterator_to_offset(j), 0); - } + //} fh_fastafs << this->fourbit_data.data; } @@ -133,6 +155,8 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s = nullptr; while(s == nullptr and getline(fh_fasta, line)) { if(line[0] == '>') { + + // init new sequence line.erase(0, 1);// erases first part, quicker would be pointer from first char s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// placeholder for sequence length @@ -144,8 +168,9 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string while(getline(fh_fasta, line)) { if(line[0] == '>') { s->finish_sequence(fh_fastafs); + + // init sequence line.erase(0, 1);// erases first part, quicker would be pointer from first char - s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// number of 2bit encoded nucleotides, not yet known index.push_back(s); @@ -153,7 +178,64 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { - case 'U': + case 'A': + if(s->in_m_block) { + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(0, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'a': + if(!s->in_m_block) { + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(0, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'C': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(1, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'c': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(1, fh_fastafs); + MD5_Update(&s->ctx, na, 1); + break; + case 'G': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(2, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; + case 'g': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(2, fh_fastafs); + MD5_Update(&s->ctx, ng, 1); + break; case 'T': if(s->in_m_block) { //printf("ending M block: %d\n", s->N + s->n_actg - 1); @@ -161,10 +243,9 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s->in_m_block = false; } - s->add_ACTG(NUCLEOTIDE_T, fh_fastafs); - MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_nucleotide + s->add_ACTG(3, fh_fastafs); + MD5_Update(&s->ctx, nt, 1); break; - case 'u':// lower case = m block case 't': if(!s->in_m_block) { //printf("starting M block: %d\n", s->N + s->n_actg); @@ -172,68 +253,231 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s->in_m_block = true; } - s->add_ACTG(NUCLEOTIDE_T, fh_fastafs); - MD5_Update(&s->ctx, nt, 1);// this needs to be pu in add_nucleotide + s->add_ACTG(3, fh_fastafs); + MD5_Update(&s->ctx, nt, 1); break; - case 'C': + case 'U': if(s->in_m_block) { //printf("ending M block: %d\n", s->N + s->n_actg - 1); s->m_block_ends.push_back(s->N + s->n_actg - 1); s->in_m_block = false; } - s->add_ACTG(NUCLEOTIDE_C, fh_fastafs); - MD5_Update(&s->ctx, nc, 1); + s->add_ACTG(4, fh_fastafs); + MD5_Update(&s->ctx, nu, 1); break; - case 'c': + case 'u': if(!s->in_m_block) { //printf("starting M block: %d\n", s->N + s->n_actg); s->m_block_starts.push_back(s->N + s->n_actg); s->in_m_block = true; } - s->add_ACTG(NUCLEOTIDE_C, fh_fastafs); - MD5_Update(&s->ctx, nc, 1); + s->add_ACTG(4, fh_fastafs); + MD5_Update(&s->ctx, nu, 1); break; - case 'A': + +// ==================================================== + + case 'R': if(s->in_m_block) { //printf("ending M block: %d\n", s->N + s->n_actg - 1); s->m_block_ends.push_back(s->N + s->n_actg - 1); s->in_m_block = false; } - s->add_ACTG(NUCLEOTIDE_A, fh_fastafs); - MD5_Update(&s->ctx, na, 1); + s->add_ACTG(5, fh_fastafs); + MD5_Update(&s->ctx, nr, 1); break; - case 'a': + case 'r': if(!s->in_m_block) { //printf("starting M block: %d\n", s->N + s->n_actg); s->m_block_starts.push_back(s->N + s->n_actg); s->in_m_block = true; } - s->add_ACTG(NUCLEOTIDE_A, fh_fastafs); - MD5_Update(&s->ctx, na, 1); + s->add_ACTG(5, fh_fastafs); + MD5_Update(&s->ctx, nr, 1); break; - case 'G': + case 'Y': if(s->in_m_block) { //printf("ending M block: %d\n", s->N + s->n_actg - 1); s->m_block_ends.push_back(s->N + s->n_actg - 1); s->in_m_block = false; } - s->add_ACTG(NUCLEOTIDE_G, fh_fastafs); - MD5_Update(&s->ctx, ng, 1); + s->add_ACTG(6, fh_fastafs); + MD5_Update(&s->ctx, ny, 1); break; - case 'g': + case 'y': if(!s->in_m_block) { //printf("starting M block: %d\n", s->N + s->n_actg); s->m_block_starts.push_back(s->N + s->n_actg); s->in_m_block = true; } - s->add_ACTG(NUCLEOTIDE_G, fh_fastafs); - MD5_Update(&s->ctx, ng, 1); + s->add_ACTG(6, fh_fastafs); + MD5_Update(&s->ctx, ny, 1); + break; + case 'K': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(7, fh_fastafs); + MD5_Update(&s->ctx, nk, 1); + break; + case 'k': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(7, fh_fastafs); + MD5_Update(&s->ctx, nk, 1); + break; + case 'M': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(8, fh_fastafs); + MD5_Update(&s->ctx, nm, 1); + break; + case 'm': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(8, fh_fastafs); + MD5_Update(&s->ctx, nm, 1); + break; + case 'S': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(9, fh_fastafs); + MD5_Update(&s->ctx, ns, 1); + break; + case 's': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(9, fh_fastafs); + MD5_Update(&s->ctx, ns, 1); + break; + case 'W': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(10, fh_fastafs); + MD5_Update(&s->ctx, nw, 1); + break; + case 'w': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(10, fh_fastafs); + MD5_Update(&s->ctx, nw, 1); + break; + case 'B': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(11, fh_fastafs); + MD5_Update(&s->ctx, nb, 1); + break; + case 'b': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(11, fh_fastafs); + MD5_Update(&s->ctx, nb, 1); + break; + case 'D': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(12, fh_fastafs); + MD5_Update(&s->ctx, nd, 1); + break; + case 'd': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(12, fh_fastafs); + MD5_Update(&s->ctx, nd, 1); + break; + case 'H': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(13, fh_fastafs); + MD5_Update(&s->ctx, nh, 1); + break; + case 'h': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(13, fh_fastafs); + MD5_Update(&s->ctx, nh, 1); + break; + case 'V': + if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + } + + s->add_ACTG(14, fh_fastafs); + MD5_Update(&s->ctx, nv, 1); + break; + case 'v': + if(!s->in_m_block) { + //printf("starting M block: %d\n", s->N + s->n_actg); + s->m_block_starts.push_back(s->N + s->n_actg); + s->in_m_block = true; + } + + s->add_ACTG(14, fh_fastafs); + MD5_Update(&s->ctx, nv, 1); break; case 'N': if(s->in_m_block) { @@ -242,7 +486,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s->in_m_block = false; } - s->add_N(); + s->add_ACTG(15, fh_fastafs); MD5_Update(&s->ctx, nn, 1); break; case 'n': @@ -252,9 +496,10 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s->in_m_block = true; } - s->add_N(); + s->add_ACTG(15, fh_fastafs); MD5_Update(&s->ctx, nn, 1); break; + default: std::cerr << "invalid chars in FASTA file" << std::endl; exit(1); diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 90ad5b45..de388f37 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -139,5 +139,11 @@ BOOST_AUTO_TEST_CASE(Test_size) +BOOST_AUTO_TEST_CASE(test_cache) +{ + size_t written = fasta_to_fourbit_fastafs("test/data/test_004.fa", "tmp/test_004.fastafs"); +} + + BOOST_AUTO_TEST_SUITE_END() From 70df4fd11f2112b5012a723101ec143e99c60899 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 09:38:09 +0100 Subject: [PATCH 013/119] change --- src/fasta_to_fourbit_fastafs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index 0d313ec4..e6bbdb33 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -145,6 +145,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + s = nullptr; if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; @@ -152,7 +153,6 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // iterate until first sequence is found, ensuring we won't write to uninitialized sequences - s = nullptr; while(s == nullptr and getline(fh_fasta, line)) { if(line[0] == '>') { From a4393886507ca3e8c3ffc294e546667e76e774f6 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 09:39:28 +0100 Subject: [PATCH 014/119] sav --- src/fasta_to_twobit_fastafs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index b28bd3ef..57918a32 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -123,6 +123,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + s = nullptr; if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; @@ -130,7 +131,6 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // iterate until first sequence is found, ensuring we won't write to uninitialized sequences - s = nullptr; while(s == nullptr and getline(fh_fasta, line)) { if(line[0] == '>') { line.erase(0, 1);// erases first part, quicker would be pointer from first char From da954d5ca6edba1a2c908f91317e715869e0efa0 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 13:46:06 +0100 Subject: [PATCH 015/119] closing in --- src/fourbit_byte.cpp | 7 ++++++- test/fourbit_byte/test_fourbit_byte.cpp | 23 ++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index 8673aa6f..09587e14 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -42,7 +42,12 @@ not sure what the quickest way is - this way all calculations are done as ints, */ unsigned char fourbit_byte::iterator_to_offset(uint32_t iterator) { - return (unsigned char)((3 - (iterator % 4)) * 2); + if(iterator % 2 == 0) { + return 4; + } + else { + return 0; + } } // @todo, offset needs to be second parameter diff --git a/test/fourbit_byte/test_fourbit_byte.cpp b/test/fourbit_byte/test_fourbit_byte.cpp index bf2806bc..831bf828 100644 --- a/test/fourbit_byte/test_fourbit_byte.cpp +++ b/test/fourbit_byte/test_fourbit_byte.cpp @@ -93,17 +93,22 @@ BOOST_AUTO_TEST_CASE(test_fourbit_conversions) BOOST_AUTO_TEST_CASE(test_fourbit_static_offset_conversion_test) { - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(0), 6); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(1), 4); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(2), 2); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(0), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(1), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(2), 4); BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(3), 0); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(4), 6); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(5), 4); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(6), 2); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(4), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(5), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(6), 4); BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(7), 0); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(8), 6); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(9), 4); - BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(10), 2); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(8), 4); + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(9), 0); + + BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(10), 4); BOOST_CHECK_EQUAL(fourbit_byte::iterator_to_offset(11), 0); } From dddb3eb0141204dfd16c5c97dede6b4344e72c2c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 14:01:19 +0100 Subject: [PATCH 016/119] sav --- src/fasta_to_fourbit_fastafs.cpp | 24 +++++++++++++----------- src/fasta_to_twobit_fastafs.cpp | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index e6bbdb33..0944580a 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -33,13 +33,6 @@ const static char nn[2] = "N"; void fasta_seq_header_fourbit_conversion_data::add_ACTG(unsigned char nucleotide, std::ofstream &fh_fastafs) { - std::cout << this->n_actg; - std::cout << "\n"; - std::cout << " => " << std::to_string(fourbit_byte::iterator_to_offset(this->n_actg)) << "\n\n"; - - // 0 -> 4 - // 1 -> 0 toch? - this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), nucleotide);//0 = TU, 1 = // if fourth nucleotide, 2bit is complete; write to disk @@ -73,9 +66,8 @@ void fasta_seq_header_fourbit_conversion_data::finish_sequence(std::ofstream &fh // flush last nucleotide if(this->n_actg % 2 != 0) { - //for(j = this->n_actg % 2; j < 1; j++) { - this->fourbit_data.set(fourbit_byte::iterator_to_offset(j), 0); - //} + this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), 0); + fh_fastafs << this->fourbit_data.data; } @@ -499,6 +491,16 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s->add_ACTG(15, fh_fastafs); MD5_Update(&s->ctx, nn, 1); break; + case '-': + /*if(s->in_m_block) { + //printf("ending M block: %d\n", s->N + s->n_actg - 1); + s->m_block_ends.push_back(s->N + s->n_actg - 1); + s->in_m_block = false; + }*/ + + s->add_N(); + //MD5_Update(&s->ctx, nn, 1); + break; default: std::cerr << "invalid chars in FASTA file" << std::endl; @@ -525,7 +527,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s = index[i]; // flag - fh_fastafs << "\x00\x08"s; + fh_fastafs << "\x00\x0A"s;// 00001010 (IUPEC + completed-with-checksum) // name unsigned char name_size = (unsigned char) s->name.size(); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 57918a32..f6f3081b 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -280,7 +280,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f s = index[i]; // flag - fh_fastafs << "\x00\x08"s; + fh_fastafs << "\x00\x08"s;// 00001000 (DNA + completed-with-checksum) // name unsigned char name_size = (unsigned char) s->name.size(); From cbfb136fb533d7317ea08477208acc129dc0f938 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 14:45:15 +0100 Subject: [PATCH 017/119] testing returned length is correct --- src/fasta_to_fourbit_fastafs.cpp | 2 +- src/fasta_to_twobit_fastafs.cpp | 3 ++- src/fourbit_byte.cpp | 1 + test/cache/test_cache_fourbit.cpp | 36 +++++++++++++++++++++++++++++++ test/data/test_004.fa | 14 ++++-------- 5 files changed, 44 insertions(+), 12 deletions(-) diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index 0944580a..4954bd8c 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -527,7 +527,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string s = index[i]; // flag - fh_fastafs << "\x00\x0A"s;// 00001010 (IUPEC + completed-with-checksum) + fh_fastafs << "\x00\x0A"s;// 00001010 (IUPEC + completed-with-checksum) | this probably has to be mirrored as last and first bit are swapped // name unsigned char name_size = (unsigned char) s->name.size(); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index f6f3081b..09dc1b2d 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -153,6 +153,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { + // keeping daling with upper-case and lower-case in separate cases is quicker than one if/else before the switch, simply beacuse switches are faster than if-statements. case 'U': case 'T': if(s->in_m_block) { @@ -280,7 +281,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f s = index[i]; // flag - fh_fastafs << "\x00\x08"s;// 00001000 (DNA + completed-with-checksum) + fh_fastafs << "\x00\x08"s;// 00001000 (DNA + completed-with-checksum) | this probably has to be mirrored as last and first bit are swapped // name unsigned char name_size = (unsigned char) s->name.size(); diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index 09587e14..775604a7 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -237,6 +237,7 @@ char *fourbit_byte::get(unsigned char length) seq[i] = fourbit_byte::fourbit_hash[this->data][i]; } seq[length] = '\0'; + return seq; } diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index de388f37..1f2dc171 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -142,6 +142,42 @@ BOOST_AUTO_TEST_CASE(Test_size) BOOST_AUTO_TEST_CASE(test_cache) { size_t written = fasta_to_fourbit_fastafs("test/data/test_004.fa", "tmp/test_004.fastafs"); + + static std::string reference = + // GENERIC-HEADER + "\x0F\x0A\x46\x53"s// [0, 3] + "\x00\x00\x00\x00"s// [4, 7] version + "\x00\x01"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] + "\x00\x00\x01\x37"s // [10, 13] index position in file (???) + + // DATA + "\x00\x00\x00\x10"s// [14, 17] seq length (76) + "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 1 sequence (four bit format; n chars = 76/2 = 38) + "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 2 + "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 3 + "\x00\x55\xAA\xFF\x00\x00\x00\x00"s// 4 + "\x00\x55\xAA\xFF\x00"s// 12 + "\x00\x00\x00\x00"s// [22, 25] n-blocks (2) + "\x00\x00\x00\x00"s// [50, 53] n-block[0] starts (0) + "\x00\x00\x00\x0F"s// [54, 57] n-block[0] starts (15) + "\x00\x00\x00\x00"s// [50, 53] n-block[1] starts (0) + "\x00\x00\x00\x0F"s// [54, 57] n-block[1] starts (15) + "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 45] checksum + "\x00\x00\x00\x01"s// [46, 49] m-blocks (1) + "\x00\x00\x00\x00"s// [50, 53] m-block starts (0) + "\x00\x00\x00\x0F"s// [54, 57] m-block starts (15) + + // INDEX + "\x00\x00\x00\x01"s // [339, 342] 1 sequences + "\x00\xA0" // [343, 344] complete, IUPEC + "\x05"s "IUPAC"s // [345, 349] name + "\x00\x00\x00\x0E"s // [350, 353] data position in file (14) + + // METADATA + "\x00" // [399] no metadata fields [padding will come soon?] + ; + + BOOST_CHECK_EQUAL(written, 121); } diff --git a/test/data/test_004.fa b/test/data/test_004.fa index 3b713227..689dda8a 100644 --- a/test/data/test_004.fa +++ b/test/data/test_004.fa @@ -1,10 +1,4 @@ ->test IUB/IUPAC amino acid and nucleic acid codes -VYW-DVWYGBSWRRVDH-WHBYVYAUDAMNRCHYHDRGRBSNKAHSHCGYVBRS--GDSB -RRHMGWSGCDURK-URTVMSDCYYDDVYYNRKSWRHWRCNUSTTCCRKKTKATVNMMWMG -MWABHUUMBGTCHRGNYKBBBKHVHRVCNUSC-KMCSYDHDKK-AVMAD-KHTWAKMTAA -UWCDNGBRHHYCGSVHYSSTDWSGBRANUMMADDYMHYTURHSWSNBNRNWMAAGRCGRB -AKVYDCKKCUAWDNAUKMRVTDGGYGMKHYW-BADCKTNGBGDHGHMVKU-KSTWUGKKW -SMCYHUYAUGMRHMTUMWRARR--NR-GVBTVYVCWSNCGWRRCWWAGKAMAYCVK---C -GRGUVDCRHUSABANWSKAUCSYW-K-VCDM-DRGBDTUAHSWGUBHCAVCMUVUVHR-G -TAUCUUKWGNRCDYNNDCGGRCAAVAHDSDWMHNBWYRCBDWGVCDCGTMHNSHRVYMMK -YD-SBYSCHAGCACGNRYTSYUYKYA-TCDNAN-BRCVCN +>IUPAC +NBKAHMDCUWGSYVTRHGWVUMTBSDN----- +-----BGYADNHSMUTRCKWVsbhvdnrtgyc +mkwuaAVTSDKNB---UGWMHYRC From a2466cf610456bfbca4e1f958ec72be70ddc5da3 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 14:56:08 +0100 Subject: [PATCH 018/119] sav --- test/cache/test_cache_fourbit.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 1f2dc171..c53dfd44 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -148,24 +148,24 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x0F\x0A\x46\x53"s// [0, 3] "\x00\x00\x00\x00"s// [4, 7] version "\x00\x01"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] - "\x00\x00\x01\x37"s // [10, 13] index position in file (???) + "\x00\x00\x00\x68"s // [10, 13] index position in file (104?) // DATA - "\x00\x00\x00\x10"s// [14, 17] seq length (76) + "\x00\x00\x00\x4C"s// [14, 17] seq length (76) "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 1 sequence (four bit format; n chars = 76/2 = 38) "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 2 "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 3 "\x00\x55\xAA\xFF\x00\x00\x00\x00"s// 4 "\x00\x55\xAA\xFF\x00"s// 12 - "\x00\x00\x00\x00"s// [22, 25] n-blocks (2) - "\x00\x00\x00\x00"s// [50, 53] n-block[0] starts (0) - "\x00\x00\x00\x0F"s// [54, 57] n-block[0] starts (15) - "\x00\x00\x00\x00"s// [50, 53] n-block[1] starts (0) - "\x00\x00\x00\x0F"s// [54, 57] n-block[1] starts (15) + "\x00\x00\x00\x02"s// [22, 25] n-blocks (2) + "\x00\x00\x00\x1A"s// [50, 53] n-block[0] starts (26) + "\x00\x00\x00\x24"s// [54, 57] n-block[0] ends (36|37) + "\x00\x00\x00\x4D"s// [50, 53] n-block[1] starts (77) + "\x00\x00\x00\x4F"s// [54, 57] n-block[1] ends (79) "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 45] checksum "\x00\x00\x00\x01"s// [46, 49] m-blocks (1) - "\x00\x00\x00\x00"s// [50, 53] m-block starts (0) - "\x00\x00\x00\x0F"s// [54, 57] m-block starts (15) + "\x00\x00\x00\x00"s// [50, 53] m-block starts (53) + "\x00\x00\x00\x0F"s// [54, 57] m-block starts (65) // INDEX "\x00\x00\x00\x01"s // [339, 342] 1 sequences From aeff712caaa07c5ab5311f8be0b2ecba08072a18 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 28 Nov 2019 15:25:47 +0100 Subject: [PATCH 019/119] contents tested, view not yet done --- src/fasta_to_fourbit_fastafs.cpp | 1 + test/cache/test_cache_fourbit.cpp | 65 +++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index 4954bd8c..f7503b64 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -519,6 +519,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string // write index/footer unsigned int index_file_position = (uint32_t) fh_fastafs.tellp(); + //std::cout << " index file pos: " + std::to_string(index_file_position) + " \n"; char buffer[4 + 1]; uint_to_fourbytes(buffer, (uint32_t) index.size()); fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index c53dfd44..8cb85909 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -151,33 +151,56 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00\x00\x00\x68"s // [10, 13] index position in file (104?) // DATA - "\x00\x00\x00\x4C"s// [14, 17] seq length (76) - "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 1 sequence (four bit format; n chars = 76/2 = 38) - "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 2 - "\x00\x55\xAA\xFF\x00\x00\x00\x00\x00\x00"s// 3 - "\x00\x55\xAA\xFF\x00\x00\x00\x00"s// 4 - "\x00\x55\xAA\xFF\x00"s// 12 - "\x00\x00\x00\x02"s// [22, 25] n-blocks (2) - "\x00\x00\x00\x1A"s// [50, 53] n-block[0] starts (26) - "\x00\x00\x00\x24"s// [54, 57] n-block[0] ends (36|37) - "\x00\x00\x00\x4D"s// [50, 53] n-block[1] starts (77) - "\x00\x00\x00\x4F"s// [54, 57] n-block[1] ends (79) - "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 45] checksum - "\x00\x00\x00\x01"s// [46, 49] m-blocks (1) - "\x00\x00\x00\x00"s// [50, 53] m-block starts (53) - "\x00\x00\x00\x0F"s// [54, 57] m-block starts (65) + "\x00\x00\x00\x4B"s// [14, 17] seq length (75) + "\xFB\x70\xD8\xC1\x4A\x29\x6E\x35\xD2\xAE"s// [18, 27] sequence (four bit format; n chars = 76/2 = 38) + "\x48\x3B\x9C\xFB\x26\x0C\xFD\x98\x43\x51"s// [28, 37] + "\x7A\xE9\xBD\xEC\xF5\x32\x61\x87\xA4\x00"s// [38, 47] + "\xE3\x9C\x7F\xB4\x2A\x8D\x65\x10"s// [48, 56] + "\x00\x00\x00\x02"s// [, ] n-blocks (2) + "\x00\x00\x00\x1B"s// [, ] n-block[0] starts (27) + "\x00\x00\x00\x4D"s// [, ] n-block[1] starts (77) + "\x00\x00\x00\x24"s// [, ] n-block[0] ends (36|37) + "\x00\x00\x00\x4F"s// [, ] n-block[1] ends (79) + "\xEE\x09\x2F\x63\x4F\x6C\x87\xD0\x6B\x57\x1F\x07\xD1\x42\x73\x00"s// [76, ] checksum + "\x00\x00\x00\x01"s// [92, ] m-blocks (1) + "\x00\x00\x00\x35"s// [96, ] m-block starts (53) + "\x00\x00\x00\x44"s// [100, ] m-block starts (68) // INDEX - "\x00\x00\x00\x01"s // [339, 342] 1 sequences - "\x00\xA0" // [343, 344] complete, IUPEC - "\x05"s "IUPAC"s // [345, 349] name - "\x00\x00\x00\x0E"s // [350, 353] data position in file (14) + "\x00\x00\x00\x01"s // [104, ] 1 sequences + "\x00\x0A" // [, ] complete, IUPEC + "\x05"s "IUPAC"s // [, ] name + "\x00\x00\x00\x0E"s // [, ] data position in file (14) // METADATA - "\x00" // [399] no metadata fields [padding will come soon?] + "\x00" // [120] no metadata fields [padding will come soon?] ; - BOOST_CHECK_EQUAL(written, 121); + BOOST_CHECK_EQUAL(written, 121); + + //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); + std::ifstream file("tmp/test_004.fastafs", std::ios::in | std::ios::binary | std::ios::ate); + BOOST_REQUIRE(file.is_open()); + + std::streampos size; + char * buffer; + size = file.tellg(); + buffer = new char [size]; + + file.seekg(0, std::ios::beg); + file.read(buffer, size); + file.close(); + for(unsigned int i = 0; i < size; i++) { + BOOST_CHECK_EQUAL(buffer[i], reference[i]); + + if(reference[i] != buffer[i]) { + printf("comparing char %i\n", i); + printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); + } + + } + + delete[] buffer; } From 719fb2bab39b8cee4bee8535770ee79091214936 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Fri, 29 Nov 2019 16:46:14 +0100 Subject: [PATCH 020/119] sav --- doc/4bit.txt | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 doc/4bit.txt diff --git a/doc/4bit.txt b/doc/4bit.txt new file mode 100644 index 00000000..8e24e65d --- /dev/null +++ b/doc/4bit.txt @@ -0,0 +1,18 @@ +A 0000 +B 0001 +C 0010 +D 0011 +G 0100 +H 0101 +K 0110 +M 0111 +N 1000 +R 1001 +S 1010 +T 1011 +U 1100 +V 1101 +W 1110 +Y 1111 + +- by idx From 2fcc10692c098bd3932708342446450e7612281f Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Fri, 29 Nov 2019 16:52:20 +0100 Subject: [PATCH 021/119] sav --- doc/FASTAFS-FORMAT-SPECIFICATION.md | 8 ++-- include/fastafs.hpp | 59 +++++++++++++++++++++++- src/fastafs.cpp | 17 ++++++- test/fourbit_byte/test_four_byte.cpp | 68 ++++++++++++++++++++++++++++ 4 files changed, 146 insertions(+), 6 deletions(-) create mode 100644 test/fourbit_byte/test_four_byte.cpp diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index 9169724f..bfdf4810 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -21,7 +21,7 @@ If this metadata would be written in the header located before the sequence data | GENERIC-HEADER | | | | | | [MAGIC](#magic) | 4 bytes | `x0F x0A x46 x53` | | | [FILE FORMAT VERSION](#file-format-version) | [4-byte integer](#four-byte-integer) | `x00 x00 x00 x00` | -| | [FASTAFS-FLAG](#fastafs-flag) | 2 bytes | Certain binary flags | +| | [FASTAFS-FLAGS](#fastafs-flags) | 2 bytes | Certain binary flags | | | [FILE-POSITION-OF-INDEX](#file-position-of-the-index) | [4-byte integer](#four-byte-integer) | Location in the file (offset in bytes from beginning) where the INDEX is located | | DATA | --- | --- | --- | | -> per sequence | @@ -40,7 +40,7 @@ If this metadata would be written in the header located before the sequence data | INDEX | --- | --- | | | | NUMBER-SEQUENCES | uint32_t as [4-byte integer](#four-byte-integer) | Number of sequences included | | -> per sequence | -| | [SEQUENCE-FLAG](#sequence-flag) | 2 bytes | storing metadata and type of data | +| | [SEQUENCE-FLAGS](#sequence-flags) | 2 bytes | storing metadata and type of data | | | NAME-LENGTH | 1 byte as unsigned char | length in bytes; name cannot exceed 255 bytes | | | NAME-FASTA | NAME-LENGTH x char | FASTA header; may not include new-lines or '>' | | | START-POSITION-IN-BODY of N-COMPR-NUC | uint32_t as [4-byte integer](#four-byte-integer) | Location in the file (offset in bytes from beginning) where the DATA block for this sequence starts | @@ -80,7 +80,7 @@ The bit representation of these bytes are: +--------+--------+--------+--------+ ``` -#### FASTAFS-FLAG #### +#### FASTAFS-FLAGS #### ``` bit 0 file-complete @@ -115,7 +115,7 @@ The index is located at the end of the data. This file offset in bytes from the Repeated for every sequence, in order matching SEQUENCE-HEADER -#### SEQUENCE-FLAG #### +#### SEQUENCE-FLAGS #### The sequence flag allows to describe the following metadata for each sequence: diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 7335a13a..c3f9b48c 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -10,6 +10,63 @@ #define FASTAFS_HPP + + +class fastafs_flags { + private: + unsigned char bits[2];// 00000000 00000000 + + public: + void set(char*); +}; + +class fastafs_sequence_flags { + private: + unsigned char bits[2];// 00000000 00000000 + + // set by flag + void set_flag(unsigned char, bool);// counting flag from bit 0(!) + + + public: + void set(char*); + + bool is_dna(); // alphabet: 'ACTG' + 'N' + bool is_rna(); // alphabet: 'ACUG' + 'N' + bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' + + bool is_complete(); + bool is_incomplete() {return !this->is_complete(); }; + + bool is_linear(); + bool is_circular() {return !this->is_linear(); }; + + bool get_flag(unsigned char);// counting flag position from bit 0 + + + // set by entity + void set_dna(); + void set_rna(); + void set_iupec_nucleotide(); + + void set_complete(); + void set_incomplete(); + + void set_linear(); + void set_circular(); + +}; + + + + + + + + + + + struct ffs2f_init_seq { // fasta seq size // fasta seq newlines/padding lines @@ -55,7 +112,7 @@ class fastafs_seq uint32_t n;// number nucleotides std::vector n_starts;// start positions (nucleotide positions; 0-based) std::vector n_ends;// end positions (nucleotide positions; 0-based) - uint16_t flag; + fastafs_flags flags; std::vector m_starts;// start positions (nucleotide positions; 0-based) std::vector m_ends;// end positions (nucleotide positions; 0-based) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index d786e02b..8eb4d56a 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -35,6 +35,20 @@ + +void fastafs_flags::set(char *data) { + this->bits[0] = data[0]; + this->bits[1] = data[1]; +} + +void fastafs_sequence_flags::set(char *data) { + this->bits[0] = data[0]; + this->bits[1] = data[1]; +} + + + + static const std::string dict_sq = "@SQ\tSN:"; static const std::string dict_ln = "\tLN:"; static const std::string dict_m5 = "\tM5:"; @@ -539,7 +553,8 @@ void fastafs::load(std::string afilename) // flag file.read(memblock, 2); - s->flag = twobytes_to_uint(memblock); + s->flags = fastafs_flags();//twobytes_to_uint(memblock); + s->flags.set(memblock); // name length file.read(memblock, 1); diff --git a/test/fourbit_byte/test_four_byte.cpp b/test/fourbit_byte/test_four_byte.cpp new file mode 100644 index 00000000..eb099cfb --- /dev/null +++ b/test/fourbit_byte/test_four_byte.cpp @@ -0,0 +1,68 @@ +#define BOOST_TEST_MODULE fourbit_byte + +#include + +#include "config.hpp" + +#include "fourbit_byte.hpp" + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_twobit_conversions) +{ + char seq[5]; + seq[4] = '\0'; + twobit_byte t; + seq[0] = 'A'; + seq[1] = 'A'; + seq[2] = 'A'; + seq[3] = 'A'; + t.set(seq);//10101010 = 170 + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 170); + + seq[0] = 'T'; + seq[1] = 'A'; + seq[2] = 'A'; + seq[3] = 'A'; + t.set(seq); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 42); + seq[0] = 'A'; + seq[1] = 'C'; + seq[2] = 'T'; + seq[3] = 'G'; + t.set(seq); + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 147); + seq[0] = 'N'; + seq[1] = 'C'; + seq[2] = 'T'; + seq[3] = 'N'; + t.set(seq);//00 01 00 00 + printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); + BOOST_CHECK_EQUAL(t.data, 16); +} + +BOOST_AUTO_TEST_CASE(test_twobit_static_offset_conversion_test) +{ + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(0), 6); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(1), 4); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(2), 2); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(3), 0); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(4), 6); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(5), 4); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(6), 2); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(7), 0); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(8), 6); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(9), 4); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(10), 2); + BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(11), 0); +} + + + + +BOOST_AUTO_TEST_SUITE_END() From 16fb91779c0b0ca1a16913f037ca17ce89596fc6 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Fri, 29 Nov 2019 16:52:35 +0100 Subject: [PATCH 022/119] tidy --- include/fastafs.hpp | 74 +++++---- include/fourbit_byte.hpp | 2 +- src/fasta_to_fourbit_fastafs.cpp | 12 +- src/fasta_to_twobit_fastafs.cpp | 4 +- src/fastafs.cpp | 6 +- src/fourbit_byte.cpp | 261 +++++++++++++++--------------- src/fuse.cpp | 2 +- src/lsfastafs.cpp | 4 +- src/twobit_byte.cpp | 18 +-- src/utils.cpp | 42 ++--- test/cache/test_cache_fourbit.cpp | 24 +-- 11 files changed, 230 insertions(+), 219 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index c3f9b48c..e1ea847d 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -12,48 +12,56 @@ -class fastafs_flags { - private: - unsigned char bits[2];// 00000000 00000000 - - public: - void set(char*); +class fastafs_flags +{ +private: + unsigned char bits[2];// 00000000 00000000 + +public: + void set(char*); }; -class fastafs_sequence_flags { - private: - unsigned char bits[2];// 00000000 00000000 +class fastafs_sequence_flags +{ +private: + unsigned char bits[2];// 00000000 00000000 + + // set by flag + void set_flag(unsigned char, bool);// counting flag from bit 0(!) - // set by flag - void set_flag(unsigned char, bool);// counting flag from bit 0(!) +public: + void set(char*); + + bool is_dna(); // alphabet: 'ACTG' + 'N' + bool is_rna(); // alphabet: 'ACUG' + 'N' + bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' + + bool is_complete(); + bool is_incomplete() + { + return !this->is_complete(); + }; + + bool is_linear(); + bool is_circular() + { + return !this->is_linear(); + }; - public: - void set(char*); - - bool is_dna(); // alphabet: 'ACTG' + 'N' - bool is_rna(); // alphabet: 'ACUG' + 'N' - bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' - - bool is_complete(); - bool is_incomplete() {return !this->is_complete(); }; - - bool is_linear(); - bool is_circular() {return !this->is_linear(); }; - - bool get_flag(unsigned char);// counting flag position from bit 0 + bool get_flag(unsigned char);// counting flag position from bit 0 - // set by entity - void set_dna(); - void set_rna(); - void set_iupec_nucleotide(); + // set by entity + void set_dna(); + void set_rna(); + void set_iupec_nucleotide(); - void set_complete(); - void set_incomplete(); + void set_complete(); + void set_incomplete(); - void set_linear(); - void set_circular(); + void set_linear(); + void set_circular(); }; diff --git a/include/fourbit_byte.hpp b/include/fourbit_byte.hpp index a1a41145..aa826406 100644 --- a/include/fourbit_byte.hpp +++ b/include/fourbit_byte.hpp @@ -8,7 +8,7 @@ class fourbit_byte { public: - static const char fourbit_alhpabet[17]; + static const char fourbit_alhpabet[17]; static const char fourbit_hash[256][3]; unsigned char data; diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index f7503b64..79b2d242 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -66,8 +66,8 @@ void fasta_seq_header_fourbit_conversion_data::finish_sequence(std::ofstream &fh // flush last nucleotide if(this->n_actg % 2 != 0) { - this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), 0); - + this->fourbit_data.set(fourbit_byte::iterator_to_offset(this->n_actg), 0); + fh_fastafs << this->fourbit_data.data; } @@ -137,7 +137,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); - s = nullptr; + s = nullptr; if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; @@ -147,8 +147,8 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string // iterate until first sequence is found, ensuring we won't write to uninitialized sequences while(s == nullptr and getline(fh_fasta, line)) { if(line[0] == '>') { - - // init new sequence + + // init new sequence line.erase(0, 1);// erases first part, quicker would be pointer from first char s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); fh_fastafs << "\x00\x00\x00\x00"s;// placeholder for sequence length @@ -160,7 +160,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string while(getline(fh_fasta, line)) { if(line[0] == '>') { s->finish_sequence(fh_fastafs); - + // init sequence line.erase(0, 1);// erases first part, quicker would be pointer from first char s = new fasta_seq_header_fourbit_conversion_data(fh_fastafs.tellp(), line); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 09dc1b2d..ead6ce0f 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -123,7 +123,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); std::ofstream fh_fastafs(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); - s = nullptr; + s = nullptr; if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; @@ -153,7 +153,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { - // keeping daling with upper-case and lower-case in separate cases is quicker than one if/else before the switch, simply beacuse switches are faster than if-statements. + // keeping daling with upper-case and lower-case in separate cases is quicker than one if/else before the switch, simply beacuse switches are faster than if-statements. case 'U': case 'T': if(s->in_m_block) { diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 8eb4d56a..c1ee56f7 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -36,12 +36,14 @@ -void fastafs_flags::set(char *data) { +void fastafs_flags::set(char *data) +{ this->bits[0] = data[0]; this->bits[1] = data[1]; } -void fastafs_sequence_flags::set(char *data) { +void fastafs_sequence_flags::set(char *data) +{ this->bits[0] = data[0]; this->bits[1] = data[1]; } diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index 775604a7..da44544e 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -42,12 +42,11 @@ not sure what the quickest way is - this way all calculations are done as ints, */ unsigned char fourbit_byte::iterator_to_offset(uint32_t iterator) { - if(iterator % 2 == 0) { - return 4; - } - else { - return 0; - } + if(iterator % 2 == 0) { + return 4; + } else { + return 0; + } } // @todo, offset needs to be second parameter @@ -67,67 +66,67 @@ void fourbit_byte::set(unsigned char bit_offset, unsigned char nucleotide) switch(nucleotide) { case 0:// A (0000) - this->data = (unsigned char)(this->data & ~( (8+4+2+1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data & ~((8 + 4 + 2 + 1) << bit_offset)); // set zero's break; case 1:// C (0001) - this->data = (unsigned char)(this->data & ~( (8+4+2 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 1) << bit_offset)); // set one's + this->data = (unsigned char)(this->data & ~((8 + 4 + 2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((1) << bit_offset)); // set one's break; case 2:// G (0010) - this->data = (unsigned char)(this->data & ~( (8+4 +1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 2 ) << bit_offset)); // set one's + this->data = (unsigned char)(this->data & ~((8 + 4 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((2) << bit_offset)); // set one's break; case 3:// T (0011) - this->data = (unsigned char)(this->data & ~( (8+4 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 2+1) << bit_offset)); // set one's - break; - case 4:// U (0100) - this->data = (unsigned char)(this->data & ~( (8 +2+1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 4 ) << bit_offset)); // set one's - break; - case 5:// R (0101) - this->data = (unsigned char)(this->data & ~( (8 +2 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 4 +1) << bit_offset)); // set one's - break; - case 6:// Y (0110) - this->data = (unsigned char)(this->data & ~( (8 +1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 4+2 ) << bit_offset)); // set one's - break; - case 7:// K (0111) - this->data = (unsigned char)(this->data & ~( (8 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( ( 4+2+1) << bit_offset)); // set one's - break; - case 8:// M (1000) - this->data = (unsigned char)(this->data & ~( ( 4+2+1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8 ) << bit_offset)); // set one's - break; - case 9:// S (1001) - this->data = (unsigned char)(this->data & ~( ( 4+2 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8 +1) << bit_offset)); // set one's - break; - case 10:// W (1010) - this->data = (unsigned char)(this->data & ~( ( 4 +1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8 +2 ) << bit_offset)); // set one's - break; - case 11:// B (1011) - this->data = (unsigned char)(this->data & ~( ( 4 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8 +2+1) << bit_offset)); // set one's - break; - case 12:// D (1100) - this->data = (unsigned char)(this->data & ~( ( 2+1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8+4 ) << bit_offset)); // set one's - break; - case 13:// H (1101) - this->data = (unsigned char)(this->data & ~( ( 2 ) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8+4 +1) << bit_offset)); // set one's - break; - case 14:// V (1110) - this->data = (unsigned char)(this->data & ~( ( +1) << bit_offset)); // set zero's - this->data = (unsigned char)(this->data | ( (8+4+2 ) << bit_offset)); // set one's - break; - case 15:// N (1111) - this->data = (unsigned char)(this->data | ( (8+4+2+1) << bit_offset)); // set one's - break; + this->data = (unsigned char)(this->data & ~((8 + 4) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((2 + 1) << bit_offset)); // set one's + break; + case 4:// U (0100) + this->data = (unsigned char)(this->data & ~((8 + 2 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4) << bit_offset)); // set one's + break; + case 5:// R (0101) + this->data = (unsigned char)(this->data & ~((8 + 2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4 + 1) << bit_offset)); // set one's + break; + case 6:// Y (0110) + this->data = (unsigned char)(this->data & ~((8 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4 + 2) << bit_offset)); // set one's + break; + case 7:// K (0111) + this->data = (unsigned char)(this->data & ~((8) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((4 + 2 + 1) << bit_offset)); // set one's + break; + case 8:// M (1000) + this->data = (unsigned char)(this->data & ~((4 + 2 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8) << bit_offset)); // set one's + break; + case 9:// S (1001) + this->data = (unsigned char)(this->data & ~((4 + 2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 1) << bit_offset)); // set one's + break; + case 10:// W (1010) + this->data = (unsigned char)(this->data & ~((4 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 2) << bit_offset)); // set one's + break; + case 11:// B (1011) + this->data = (unsigned char)(this->data & ~((4) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 2 + 1) << bit_offset)); // set one's + break; + case 12:// D (1100) + this->data = (unsigned char)(this->data & ~((2 + 1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 4) << bit_offset)); // set one's + break; + case 13:// H (1101) + this->data = (unsigned char)(this->data & ~((2) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 4 + 1) << bit_offset)); // set one's + break; + case 14:// V (1110) + this->data = (unsigned char)(this->data & ~((+1) << bit_offset)); // set zero's + this->data = (unsigned char)(this->data | ((8 + 4 + 2) << bit_offset)); // set one's + break; + case 15:// N (1111) + this->data = (unsigned char)(this->data | ((8 + 4 + 2 + 1) << bit_offset)); // set one's + break; #if DEBUG default: @@ -146,75 +145,75 @@ void fourbit_byte::set(char* buffer) for(unsigned char i = 0; i < 2; i++) { switch(buffer[i]) { - case 'A':// A (0000) - case 'a': - this->set(bit_offsets[i], 0); - break; - case 'C':// C (0001) - case 'c': - this->set(bit_offsets[i], 1); - break; - case 'G':// G (0010) - case 'g': - this->set(bit_offsets[i], 2); - break; - case 'T':// T (0011) - case 't': - this->set(bit_offsets[i], 3); - break; - case 'U':// U (0100) - case 'u': - this->set(bit_offsets[i], 4); - break; - case 'R':// R (0101) - case 'r': - this->set(bit_offsets[i], 5); - break; - case 'Y':// Y (0110) - case 'y': - this->set(bit_offsets[i], 6); - break; - case 'K':// K (0111) - case 'k': - this->set(bit_offsets[i], 7); - break; - case 'M':// M (1000) - case 'm': - this->set(bit_offsets[i], 8); - break; - case 'S':// S (1001) - case 's': - this->set(bit_offsets[i], 9); - break; - case 'W':// W (1010) - case 'w': - this->set(bit_offsets[i], 10); - break; - case 'B':// B (1011) - case 'b': - this->set(bit_offsets[i], 11); - break; - case 'D':// D (1100) - case 'd': - this->set(bit_offsets[i], 12); - break; - case 'H':// H (1101) - case 'h': - this->set(bit_offsets[i], 13); - break; - case 'V':// V (1110) - case 'v': - this->set(bit_offsets[i], 14); - break; - case 'N':// N (1111) - case 'n': - this->set(bit_offsets[i], 15); - break; + case 'A':// A (0000) + case 'a': + this->set(bit_offsets[i], 0); + break; + case 'C':// C (0001) + case 'c': + this->set(bit_offsets[i], 1); + break; + case 'G':// G (0010) + case 'g': + this->set(bit_offsets[i], 2); + break; + case 'T':// T (0011) + case 't': + this->set(bit_offsets[i], 3); + break; + case 'U':// U (0100) + case 'u': + this->set(bit_offsets[i], 4); + break; + case 'R':// R (0101) + case 'r': + this->set(bit_offsets[i], 5); + break; + case 'Y':// Y (0110) + case 'y': + this->set(bit_offsets[i], 6); + break; + case 'K':// K (0111) + case 'k': + this->set(bit_offsets[i], 7); + break; + case 'M':// M (1000) + case 'm': + this->set(bit_offsets[i], 8); + break; + case 'S':// S (1001) + case 's': + this->set(bit_offsets[i], 9); + break; + case 'W':// W (1010) + case 'w': + this->set(bit_offsets[i], 10); + break; + case 'B':// B (1011) + case 'b': + this->set(bit_offsets[i], 11); + break; + case 'D':// D (1100) + case 'd': + this->set(bit_offsets[i], 12); + break; + case 'H':// H (1101) + case 'h': + this->set(bit_offsets[i], 13); + break; + case 'V':// V (1110) + case 'v': + this->set(bit_offsets[i], 14); + break; + case 'N':// N (1111) + case 'n': + this->set(bit_offsets[i], 15); + break; #if DEBUG - default: - throw std::invalid_argument("fourbit_byte::set(char *) invalid value\n"); - break; + default: + throw std::invalid_argument("fourbit_byte::set(char *) invalid value\n"); + break; #endif //DEBUG } } @@ -227,9 +226,9 @@ void fourbit_byte::set(char* buffer) char *fourbit_byte::get(unsigned char length) { #if DEBUG - if(length > 2) { - throw std::invalid_argument("four_byte::get(unsigned char length) -> out of bound: " + std::to_string(length)+ "\n"); - } + if(length > 2) { + throw std::invalid_argument("four_byte::get(unsigned char length) -> out of bound: " + std::to_string(length) + "\n"); + } #endif //DEBUG char *seq = new char[length + 1]; diff --git a/src/fuse.cpp b/src/fuse.cpp index 3c5fcbe9..b81801e2 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -493,7 +493,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) fi->f->load(fname); fi->cache = fi->f->init_ffs2f(fi->padding, true);// allow mixed case } else { - std::string basename = basename_cpp(std::string(argv[mount_target_arg])); + std::string basename = basename_cpp(std::string(argv[mount_target_arg])); //std::string basename = std::filesystem::path(std::string(argv[mount_target_arg])).filename(); fi->u2b = new ucsc2bit(basename);// useses basename as prefix for filenames to mount: hg19.2bit -> hg19.2bit.fa diff --git a/src/lsfastafs.cpp b/src/lsfastafs.cpp index 9fd654a4..0a9a33c1 100644 --- a/src/lsfastafs.cpp +++ b/src/lsfastafs.cpp @@ -59,11 +59,11 @@ std::unordered_multimap > get_f strstr(mount_dir, arg) != NULL)) { std::string fn = std::string(mount_dir); - + std::string basename = basename_cpp(fn); //std::string basename = std::filesystem::path(fn).filename(); //std::cout << "basename: " << basename << "\n"; - + std::string dict_fn = std::string(mount_dir) + "/" + basename + ".dict"; if(getxattr(mount_dir, FASTAFS_FILE_XATTR_NAME.c_str(), xattr_fastafs_file, 255) != -1 diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index 0a6ac8ca..8878a533 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -48,18 +48,18 @@ void twobit_byte::set(unsigned char bit_offset, unsigned char nucleotide) // ??????00 // 11?? ~(3 << bit_offset) // data ???????? - this->data = (unsigned char)(this->data & ~( (2+1) << bit_offset)); + this->data = (unsigned char)(this->data & ~((2 + 1) << bit_offset)); break; case 1://NUCLEOTIDE_C (01) - this->data = (unsigned char)(this->data & ~( (2 ) << bit_offset)); - this->data = (unsigned char)(this->data | ( ( 1) << bit_offset)); + this->data = (unsigned char)(this->data & ~((2) << bit_offset)); + this->data = (unsigned char)(this->data | ((1) << bit_offset)); break; case 2://NUCLEOTIDE_A (10) - this->data = (unsigned char)(this->data & ~( ( 1) << bit_offset)); - this->data = (unsigned char)(this->data | ( (2 ) << bit_offset)); + this->data = (unsigned char)(this->data & ~((1) << bit_offset)); + this->data = (unsigned char)(this->data | ((2) << bit_offset)); break; case 3://NUCLEOTIDE_G (11) - this->data = (unsigned char)(this->data | ( (2+1) << bit_offset)); + this->data = (unsigned char)(this->data | ((2 + 1) << bit_offset)); break; #if DEBUG default: @@ -112,9 +112,9 @@ void twobit_byte::set(char* buffer) char *twobit_byte::get(unsigned char length) { #if DEBUG - if(length > 4) { - throw std::invalid_argument("twobit_byte::get(unsigned char length) -> out of bound: " + std::to_string(length) + "\n"); - } + if(length > 4) { + throw std::invalid_argument("twobit_byte::get(unsigned char length) -> out of bound: " + std::to_string(length) + "\n"); + } #endif //DEBUG char *seq = new char[length + 1]; diff --git a/src/utils.cpp b/src/utils.cpp index 446ad08c..be424498 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -175,29 +175,31 @@ bool is_fasta_file(char *filename) // https://www.systutorials.com/241216/how-to-get-the-directory-path-and-file-name-from-a-absolute-path-in-c-on-linux/ // https://stackoverflow.com/questions/38456127/what-is-the-value-of-cplusplus-for-c17 - THEN use std::filesystem::path(filename).filename(); -std::string basename_cpp(std::string fn) { - char* ts = strdup(fn.c_str()); - - //char* dir = dirname(ts1); - char* filename = basename(ts); - //std::string filenamepp = std::string(filename); - - //printf("basename: [%s]\n", filename); - //std::cout << "basenamepp: |" << filenamepp << "|\n"; - - return std::string(filename); +std::string basename_cpp(std::string fn) +{ + char* ts = strdup(fn.c_str()); + + //char* dir = dirname(ts1); + char* filename = basename(ts); + //std::string filenamepp = std::string(filename); + + //printf("basename: [%s]\n", filename); + //std::cout << "basenamepp: |" << filenamepp << "|\n"; + + return std::string(filename); } // https://www.linuxquestions.org/questions/programming-9/how-to-get-the-full-path-of-a-file-in-c-841046/ // https://stackoverflow.com/questions/38456127/what-is-the-value-of-cplusplus-for-c17 - THEN use std::filesystem::canonical(filename) -std::string realpath_cpp(std::string fn) { - //std::string out = "asd"; - char *path = realpath(fn.c_str(), NULL); - //printf("realpath: [%s]\n", path); - - //std::string realpathpp = std::string(path); - //std::cout << "realpath: |" << realpathpp << "|\n"; - - return std::string(path); +std::string realpath_cpp(std::string fn) +{ + //std::string out = "asd"; + char *path = realpath(fn.c_str(), NULL); + //printf("realpath: [%s]\n", path); + + //std::string realpathpp = std::string(path); + //std::cout << "realpath: |" << realpathpp << "|\n"; + + return std::string(path); } diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 8cb85909..dccf451b 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -28,7 +28,7 @@ BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) b.set(4, 0);// A => 0 b.set(0, 0); BOOST_CHECK_EQUAL(b.data, 0); - + seq1 = b.get(1); seq2 = b.get(2); seq = b.get(); @@ -54,9 +54,9 @@ BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) delete[] seq1; delete[] seq2; - - // GT: 0010 0011 - b.set(4, 2); // G + + // GT: 0010 0011 + b.set(4, 2); // G b.set(0, 3); // T BOOST_CHECK_EQUAL(b.data, 35); @@ -72,7 +72,7 @@ BOOST_AUTO_TEST_CASE(test_equality_fourbit_byte) delete[] seq2; - // set to UR (0100 0101) + // set to UR (0100 0101) b.set(4, 4); b.set(0, 5); BOOST_CHECK_EQUAL(b.data, 69); @@ -142,7 +142,7 @@ BOOST_AUTO_TEST_CASE(Test_size) BOOST_AUTO_TEST_CASE(test_cache) { size_t written = fasta_to_fourbit_fastafs("test/data/test_004.fa", "tmp/test_004.fastafs"); - + static std::string reference = // GENERIC-HEADER "\x0F\x0A\x46\x53"s// [0, 3] @@ -176,8 +176,8 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00" // [120] no metadata fields [padding will come soon?] ; - BOOST_CHECK_EQUAL(written, 121); - + BOOST_CHECK_EQUAL(written, 121); + //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); std::ifstream file("tmp/test_004.fastafs", std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); @@ -193,10 +193,10 @@ BOOST_AUTO_TEST_CASE(test_cache) for(unsigned int i = 0; i < size; i++) { BOOST_CHECK_EQUAL(buffer[i], reference[i]); - if(reference[i] != buffer[i]) { - printf("comparing char %i\n", i); - printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); - } + if(reference[i] != buffer[i]) { + printf("comparing char %i\n", i); + printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); + } } From b5f6004e928ea3d8607027dd72367d6274c1f086 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 3 Dec 2019 16:40:39 +0100 Subject: [PATCH 023/119] cleaner --- CMakeLists.txt | 4 + include/fastafs.hpp | 70 ++-------------- include/flags.hpp | 121 +++++++++++++++++++++++++++ src/fastafs.cpp | 16 ---- src/flags.cpp | 61 ++++++++++++++ test/CMakeLists.txt | 17 ++-- test/flags/test_flags.cpp | 76 +++++++++++++++++ test/fourbit_byte/test_four_byte.cpp | 68 --------------- 8 files changed, 277 insertions(+), 156 deletions(-) create mode 100644 include/flags.hpp create mode 100644 src/flags.cpp create mode 100644 test/flags/test_flags.cpp delete mode 100644 test/fourbit_byte/test_four_byte.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 196814ab..9f8aa6cf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,7 @@ add_executable(fastafs src/main.cpp src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp + src/flags.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp @@ -117,6 +118,7 @@ add_executable(mount.fastafs src/main_mount.cpp src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp + src/flags.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp @@ -130,6 +132,7 @@ set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD add_library(libfastafs SHARED src/fasta_to_twobit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp + src/flags.cpp src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp @@ -166,6 +169,7 @@ add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit") add_test(test_view "${BUILD_TEST_DIR}/test_view") +add_test(test_flags "${BUILD_TEST_DIR}/test_flags") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs") diff --git a/include/fastafs.hpp b/include/fastafs.hpp index e1ea847d..0e04b007 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -1,78 +1,18 @@ -#include - -#include -#include - -#include "utils.hpp" #ifndef FASTAFS_HPP #define FASTAFS_HPP +#include -class fastafs_flags -{ -private: - unsigned char bits[2];// 00000000 00000000 - -public: - void set(char*); -}; - -class fastafs_sequence_flags -{ -private: - unsigned char bits[2];// 00000000 00000000 - - // set by flag - void set_flag(unsigned char, bool);// counting flag from bit 0(!) - - -public: - void set(char*); - - bool is_dna(); // alphabet: 'ACTG' + 'N' - bool is_rna(); // alphabet: 'ACUG' + 'N' - bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' - - bool is_complete(); - bool is_incomplete() - { - return !this->is_complete(); - }; - - bool is_linear(); - bool is_circular() - { - return !this->is_linear(); - }; - - bool get_flag(unsigned char);// counting flag position from bit 0 - - - // set by entity - void set_dna(); - void set_rna(); - void set_iupec_nucleotide(); - - void set_complete(); - void set_incomplete(); - - void set_linear(); - void set_circular(); - -}; - - - - - - - +#include +#include +#include "utils.hpp" +#include "flags.hpp" struct ffs2f_init_seq { diff --git a/include/flags.hpp b/include/flags.hpp new file mode 100644 index 00000000..99d71ea8 --- /dev/null +++ b/include/flags.hpp @@ -0,0 +1,121 @@ + + +#ifndef FLAGS_HPP +#define FLAGS_HPP + +#include + +constexpr unsigned char mask0{ 0b0000'0001 }; // represents bit 0 +constexpr unsigned char mask1{ 0b0000'0010 }; // represents bit 1 +constexpr unsigned char mask2{ 0b0000'0100 }; // represents bit 2 +constexpr unsigned char mask3{ 0b0000'1000 }; // represents bit 3 +constexpr unsigned char mask4{ 0b0001'0000 }; // represents bit 4 +constexpr unsigned char mask5{ 0b0010'0000 }; // represents bit 5 +constexpr unsigned char mask6{ 0b0100'0000 }; // represents bit 6 +constexpr unsigned char mask7{ 0b1000'0000 }; // represents bit 7 + + +constexpr std::array bitmasks = { + + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 + + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 + + + +}; + + +//#include "utils.hpp" + + +class twobit_flag { + protected: + unsigned char bits[2];// 00000000 00000000 + + // set by flag + void set_flag(unsigned char, bool);// counting flag from bit 0(!) + bool get_flag(unsigned char); + + public: + void set(char *); +}; + + +class fastafs_flags : public twobit_flag +{ + public: + bool is_complete(); + bool is_incomplete() + { + return !this->is_complete(); + }; + + void set_complete(); + void set_incomplete(); +}; + + + +class fastafs_sequence_flags +{ +private: + unsigned char bits[2];// 00000000 00000000 + + // set by flag + void set_flag(unsigned char, bool);// counting flag from bit 0(!) + bool get_flag(unsigned char);// counting flag position from bit 0 + + +public: + void set(char*); + + bool is_dna(); // alphabet: 'ACTG' + 'N' + bool is_rna(); // alphabet: 'ACUG' + 'N' + bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' + + bool is_complete(); + bool is_incomplete() + { + return !this->is_complete(); + }; + + bool is_linear(); + bool is_circular() + { + return !this->is_linear(); + }; + + + + // set by entity + void set_dna(); + void set_rna(); + void set_iupec_nucleotide(); + + void set_complete(); + void set_incomplete(); + + void set_linear(); + void set_circular(); + +}; + + + + +#endif diff --git a/src/fastafs.cpp b/src/fastafs.cpp index c1ee56f7..27e96744 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -35,22 +35,6 @@ - -void fastafs_flags::set(char *data) -{ - this->bits[0] = data[0]; - this->bits[1] = data[1]; -} - -void fastafs_sequence_flags::set(char *data) -{ - this->bits[0] = data[0]; - this->bits[1] = data[1]; -} - - - - static const std::string dict_sq = "@SQ\tSN:"; static const std::string dict_ln = "\tLN:"; static const std::string dict_m5 = "\tM5:"; diff --git a/src/flags.cpp b/src/flags.cpp new file mode 100644 index 00000000..ffb73bdc --- /dev/null +++ b/src/flags.cpp @@ -0,0 +1,61 @@ + + + +/* +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +*/ + +//#include "config.hpp" + +#include + +#include "flags.hpp" + + + + +void twobit_flag::set(char *data) +{ + this->bits[0] = data[0]; + this->bits[1] = data[1]; +} + + + +// https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ +bool twobit_flag::get_flag(unsigned char bit) { +#if DEBUG + if(bit >= 16) { + throw std::runtime_error("twobit_flag::get_flag = out of bound: " + std::to_string(bit) + "\n"); + } +#endif //DEBUG + + return (this->bits[bit / 8] & bitmasks[bit]); +} + + + +// https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ +void twobit_flag::set_flag(unsigned char, bool enable) { + if(enable) { // + } + else { + } +} + + + + +bool fastafs_flags::is_complete() { + return this->get_flag(0); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5144c37f..e9c23267 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,13 +20,14 @@ set(BUILD_TEST_DIR "${BUILD_DIR}/test") add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) +add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) #add_executable(test_tree tree/test_tree.cpp) @@ -39,6 +40,8 @@ set_target_properties(test_cache_fourbit PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_view PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_flags + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs_as_ucsc2bit diff --git a/test/flags/test_flags.cpp b/test/flags/test_flags.cpp new file mode 100644 index 00000000..0953683b --- /dev/null +++ b/test/flags/test_flags.cpp @@ -0,0 +1,76 @@ +#define BOOST_TEST_MODULE flags + +#include + +#include "config.hpp" + +#include "flags.hpp" + + +//#include +//#include + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_fastafs_flags) +{ + fastafs_flags f; + + char buffer[2 + 1]; + buffer[2] = '\0'; + + // test: 00000000 00000000 + buffer[0] = '\x0'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + + + // test: 10000000 00000000 + buffer[0] = '\x80'; // worked with writing to file and checking with `xxd -b file` ~ this is binary equivalent to 10000000 + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + + + // test: 11111111 00000000 + buffer[0] = '\xFF'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + + // test: 00000001 00000000 + buffer[0] = '\x01'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + + + + // re-test: 00000000 00000000 + buffer[0] = '\x0'; + buffer[1] = '\x0'; + f.set(buffer); + + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + + //f.set_complete(); + //BOOST_CHECK_EQUAL(f.is_complete(), true); + //BOOST_CHECK_EQUAL(f.is_incomplete(), false); +} + + + + +BOOST_AUTO_TEST_SUITE_END() diff --git a/test/fourbit_byte/test_four_byte.cpp b/test/fourbit_byte/test_four_byte.cpp deleted file mode 100644 index eb099cfb..00000000 --- a/test/fourbit_byte/test_four_byte.cpp +++ /dev/null @@ -1,68 +0,0 @@ -#define BOOST_TEST_MODULE fourbit_byte - -#include - -#include "config.hpp" - -#include "fourbit_byte.hpp" - - -BOOST_AUTO_TEST_SUITE(Testing) - - -BOOST_AUTO_TEST_CASE(test_twobit_conversions) -{ - char seq[5]; - seq[4] = '\0'; - twobit_byte t; - seq[0] = 'A'; - seq[1] = 'A'; - seq[2] = 'A'; - seq[3] = 'A'; - t.set(seq);//10101010 = 170 - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 170); - - seq[0] = 'T'; - seq[1] = 'A'; - seq[2] = 'A'; - seq[3] = 'A'; - t.set(seq); - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 42); - seq[0] = 'A'; - seq[1] = 'C'; - seq[2] = 'T'; - seq[3] = 'G'; - t.set(seq); - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 147); - seq[0] = 'N'; - seq[1] = 'C'; - seq[2] = 'T'; - seq[3] = 'N'; - t.set(seq);//00 01 00 00 - printf("[%s] -> %i ~ %u -> [%s]\n", seq, (signed char) t.data, (unsigned char) t.data, t.get()); - BOOST_CHECK_EQUAL(t.data, 16); -} - -BOOST_AUTO_TEST_CASE(test_twobit_static_offset_conversion_test) -{ - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(0), 6); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(1), 4); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(2), 2); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(3), 0); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(4), 6); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(5), 4); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(6), 2); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(7), 0); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(8), 6); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(9), 4); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(10), 2); - BOOST_CHECK_EQUAL(twobit_byte::iterator_to_offset(11), 0); -} - - - - -BOOST_AUTO_TEST_SUITE_END() From 48887a2bdfc6f8845661e40454a3a066befb738f Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 3 Dec 2019 16:47:23 +0100 Subject: [PATCH 024/119] full testing of setting that byte --- src/flags.cpp | 25 +++++++++++++++++++++---- test/flags/test_flags.cpp | 22 +++++++++++++++++++--- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/flags.cpp b/src/flags.cpp index ffb73bdc..b455ef4a 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -35,9 +35,9 @@ void twobit_flag::set(char *data) // https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ bool twobit_flag::get_flag(unsigned char bit) { #if DEBUG - if(bit >= 16) { - throw std::runtime_error("twobit_flag::get_flag = out of bound: " + std::to_string(bit) + "\n"); - } + if(bit >= 16) { + throw std::runtime_error("twobit_flag::get_flag = out of bound: " + std::to_string(bit) + "\n"); + } #endif //DEBUG return (this->bits[bit / 8] & bitmasks[bit]); @@ -46,10 +46,17 @@ bool twobit_flag::get_flag(unsigned char bit) { // https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ -void twobit_flag::set_flag(unsigned char, bool enable) { +void twobit_flag::set_flag(unsigned char bit, bool enable) { + if(bit >= 16) { + throw std::runtime_error("twobit_flag::set_flag = out of bound: " + std::to_string(bit) + "\n"); + } + + if(enable) { // + this->bits[bit / 8] |= bitmasks[bit]; } else { + this->bits[bit / 8] &= ~bitmasks[bit]; } } @@ -59,3 +66,13 @@ void twobit_flag::set_flag(unsigned char, bool enable) { bool fastafs_flags::is_complete() { return this->get_flag(0); } + +void fastafs_flags::set_complete() { + this->set_flag(0, true); +} + +void fastafs_flags::set_incomplete() { + this->set_flag(0, false); +} + + diff --git a/test/flags/test_flags.cpp b/test/flags/test_flags.cpp index 0953683b..09e6d2f1 100644 --- a/test/flags/test_flags.cpp +++ b/test/flags/test_flags.cpp @@ -65,9 +65,25 @@ BOOST_AUTO_TEST_CASE(test_fastafs_flags) BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); - //f.set_complete(); - //BOOST_CHECK_EQUAL(f.is_complete(), true); - //BOOST_CHECK_EQUAL(f.is_incomplete(), false); + f.set_complete(); + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + f.set_complete(); + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + f.set_complete(); + BOOST_CHECK_EQUAL(f.is_complete(), true); + BOOST_CHECK_EQUAL(f.is_incomplete(), false); + + f.set_incomplete(); + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + f.set_incomplete(); + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); + f.set_incomplete(); + BOOST_CHECK_EQUAL(f.is_complete(), false); + BOOST_CHECK_EQUAL(f.is_incomplete(), true); } From 20acae2a11cd44b79210155101898f5f575fb7cb Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 3 Dec 2019 16:50:54 +0100 Subject: [PATCH 025/119] free of warnings :) --- include/flags.hpp | 48 +++++++++++++++++------------------------------ src/flags.cpp | 23 ++++------------------- 2 files changed, 21 insertions(+), 50 deletions(-) diff --git a/include/flags.hpp b/include/flags.hpp index 99d71ea8..7b8302ef 100644 --- a/include/flags.hpp +++ b/include/flags.hpp @@ -5,38 +5,24 @@ #include -constexpr unsigned char mask0{ 0b0000'0001 }; // represents bit 0 -constexpr unsigned char mask1{ 0b0000'0010 }; // represents bit 1 -constexpr unsigned char mask2{ 0b0000'0100 }; // represents bit 2 -constexpr unsigned char mask3{ 0b0000'1000 }; // represents bit 3 -constexpr unsigned char mask4{ 0b0001'0000 }; // represents bit 4 -constexpr unsigned char mask5{ 0b0010'0000 }; // represents bit 5 -constexpr unsigned char mask6{ 0b0100'0000 }; // represents bit 6 -constexpr unsigned char mask7{ 0b1000'0000 }; // represents bit 7 - - constexpr std::array bitmasks = { - - 0b1000'0000, // represents bit 7 - 0b0100'0000, // represents bit 6 - 0b0010'0000, // represents bit 5 - 0b0001'0000, // represents bit 4 - 0b0000'1000, // represents bit 3 - 0b0000'0100, // represents bit 2 - 0b0000'0010, // represents bit 1 - 0b0000'0001, // represents bit 0 - - 0b1000'0000, // represents bit 7 - 0b0100'0000, // represents bit 6 - 0b0010'0000, // represents bit 5 - 0b0001'0000, // represents bit 4 - 0b0000'1000, // represents bit 3 - 0b0000'0100, // represents bit 2 - 0b0000'0010, // represents bit 1 - 0b0000'0001, // represents bit 0 - - - + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 + + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 }; diff --git a/src/flags.cpp b/src/flags.cpp index b455ef4a..8511785c 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -1,22 +1,5 @@ - -/* -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -*/ - -//#include "config.hpp" - #include #include "flags.hpp" @@ -53,10 +36,12 @@ void twobit_flag::set_flag(unsigned char bit, bool enable) { if(enable) { // - this->bits[bit / 8] |= bitmasks[bit]; + //this->bits[bit / 8] |= bitmasks[bit]; + this->bits[bit / 8] = (unsigned char) (this->bits[bit / 8] | bitmasks[bit]); } else { - this->bits[bit / 8] &= ~bitmasks[bit]; + //this->bits[bit / 8] &= ~bitmasks[bit]; + this->bits[bit / 8] = (unsigned char) (this->bits[bit / 8] & ~bitmasks[bit]); } } From b6526a7b2d042646758366a3b8b80a17964e4237 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 3 Dec 2019 17:15:22 +0100 Subject: [PATCH 026/119] save flag system tests --- include/flags.hpp | 29 +++++++------ src/flags.cpp | 72 ++++++++++++++++++++++++++++++-- test/flags/test_flags.cpp | 88 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 173 insertions(+), 16 deletions(-) diff --git a/include/flags.hpp b/include/flags.hpp index 7b8302ef..898fdf34 100644 --- a/include/flags.hpp +++ b/include/flags.hpp @@ -5,6 +5,18 @@ #include + +const unsigned char FASTAFS_BITFLAG_COMPLETE = 0; + +const unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1 = 0; +const unsigned char FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2 = 1; +// const unsigned char FASTAFS_SEQUENCE_BITFLAG_???? = 2 ; // is reserved +const unsigned char FASTAFS_SEQUENCE_BITFLAG_COMPLETE = 3; +const unsigned char FASTAFS_SEQUENCE_BITFLAG_CIRCULAR = 4; + + + + constexpr std::array bitmasks = { 0b1000'0000, // represents bit 7 0b0100'0000, // represents bit 6 @@ -57,19 +69,10 @@ class fastafs_flags : public twobit_flag -class fastafs_sequence_flags +class fastafs_sequence_flags : public twobit_flag { -private: - unsigned char bits[2];// 00000000 00000000 - - // set by flag - void set_flag(unsigned char, bool);// counting flag from bit 0(!) - bool get_flag(unsigned char);// counting flag position from bit 0 - public: - void set(char*); - bool is_dna(); // alphabet: 'ACTG' + 'N' bool is_rna(); // alphabet: 'ACUG' + 'N' bool is_iupec_nucleotide(); // alphabet: 'ACGTURYKMSWBDHVN' + '-' @@ -80,10 +83,10 @@ class fastafs_sequence_flags return !this->is_complete(); }; - bool is_linear(); - bool is_circular() + bool is_circular(); + bool is_linear() { - return !this->is_linear(); + return !this->is_circular(); }; diff --git a/src/flags.cpp b/src/flags.cpp index 8511785c..f3a98a80 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -49,15 +49,81 @@ void twobit_flag::set_flag(unsigned char bit, bool enable) { bool fastafs_flags::is_complete() { - return this->get_flag(0); + return this->get_flag(FASTAFS_BITFLAG_COMPLETE); } void fastafs_flags::set_complete() { - this->set_flag(0, true); + this->set_flag(FASTAFS_BITFLAG_COMPLETE, true); } void fastafs_flags::set_incomplete() { - this->set_flag(0, false); + this->set_flag(FASTAFS_BITFLAG_COMPLETE, false); +} + + + + +// alphabet: 'ACTG' + 'N' +bool fastafs_sequence_flags::is_dna() { + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); +} + +// alphabet: 'ACUG' + 'N' +bool fastafs_sequence_flags::is_rna() { + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == true && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); +} + +// alphabet: 'ACGTURYKMSWBDHVN' + '-' +bool fastafs_sequence_flags::is_iupec_nucleotide() { + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == true); +} + +bool fastafs_sequence_flags::is_complete() { + return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE); +} + +bool fastafs_sequence_flags::is_circular() { + return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR); +} + + + + +void fastafs_sequence_flags::set_dna() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,0 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); +} + +void fastafs_sequence_flags::set_rna() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, true); // 1,0 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); +} + +void fastafs_sequence_flags::set_iupec_nucleotide() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,1 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, true); +} + +void fastafs_sequence_flags::set_complete() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, true); +} + +void fastafs_sequence_flags::set_incomplete() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, false); +} + +void fastafs_sequence_flags::set_linear() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, false); +} + +void fastafs_sequence_flags::set_circular() { + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, true); } diff --git a/test/flags/test_flags.cpp b/test/flags/test_flags.cpp index 09e6d2f1..418058b5 100644 --- a/test/flags/test_flags.cpp +++ b/test/flags/test_flags.cpp @@ -87,6 +87,94 @@ BOOST_AUTO_TEST_CASE(test_fastafs_flags) } +BOOST_AUTO_TEST_CASE(test_fastafs_sequence_flags) +{ + fastafs_sequence_flags fs; + + fs.set_dna(); + fs.set_rna(); + fs.set_iupec_nucleotide(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), false); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_rna(); + fs.set_iupec_nucleotide(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), false); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_dna(); + fs.set_iupec_nucleotide(); + fs.set_rna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), true); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_dna(); + fs.set_iupec_nucleotide(); + fs.set_rna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), true); + + + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + fs.set_iupec_nucleotide(); + fs.set_rna(); + fs.set_dna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), true); + BOOST_CHECK_EQUAL(fs.is_rna(), false); + + + fs.set_linear(); + BOOST_CHECK_EQUAL(fs.is_linear(), true); + BOOST_CHECK_EQUAL(fs.is_circular(), false); + + fs.set_circular(); + fs.set_circular(); + fs.set_linear(); + BOOST_CHECK_EQUAL(fs.is_linear(), true); + BOOST_CHECK_EQUAL(fs.is_circular(), false); + + fs.set_linear(); + fs.set_linear(); + fs.set_circular(); + BOOST_CHECK_EQUAL(fs.is_linear(), false); + BOOST_CHECK_EQUAL(fs.is_circular(), true); + + + fs.set_complete(); + BOOST_CHECK_EQUAL(fs.is_complete(), true); + BOOST_CHECK_EQUAL(fs.is_incomplete(), false); + + fs.set_incomplete(); + fs.set_incomplete(); + fs.set_complete(); + BOOST_CHECK_EQUAL(fs.is_complete(), true); + BOOST_CHECK_EQUAL(fs.is_incomplete(), false); + + fs.set_complete(); + fs.set_complete(); + fs.set_incomplete(); + BOOST_CHECK_EQUAL(fs.is_complete(), false); + BOOST_CHECK_EQUAL(fs.is_incomplete(), true); +} BOOST_AUTO_TEST_SUITE_END() From cff847fb262dab5dddb22334c6f7fea2702650f1 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 18 Dec 2019 13:56:28 +0100 Subject: [PATCH 027/119] using flag objects for setting flags --- include/flags.hpp | 5 ++++- src/fasta_to_twobit_fastafs.cpp | 25 +++++++++++++++++++++---- src/flags.cpp | 12 ++++++++++++ src/ucsc2bit_to_fastafs.cpp | 24 ++++++++++++++++++++---- test/cache/test_cache_twobit.cpp | 16 ++++++++-------- test/flags/test_flags.cpp | 20 ++++++++++++++++++++ 6 files changed, 85 insertions(+), 17 deletions(-) diff --git a/include/flags.hpp b/include/flags.hpp index 898fdf34..b76bb0d2 100644 --- a/include/flags.hpp +++ b/include/flags.hpp @@ -43,7 +43,9 @@ constexpr std::array bitmasks = { class twobit_flag { protected: - unsigned char bits[2];// 00000000 00000000 + twobit_flag(); + + std::array bits;// 00000000 00000000 // set by flag void set_flag(unsigned char, bool);// counting flag from bit 0(!) @@ -51,6 +53,7 @@ class twobit_flag { public: void set(char *); + std::array &get_bits(void); // get bit 0 or bit 1 }; diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index ead6ce0f..67efa253 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -4,6 +4,7 @@ #include "config.hpp" #include "fasta_to_twobit_fastafs.hpp" +#include "flags.hpp" #include "utils.hpp" @@ -119,6 +120,9 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f std::vector index; fasta_seq_header_twobit_conversion_data* s; + fastafs_flags ffsf; + ffsf.set_incomplete(); + // @todo use ifstream and ofstream argument types std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); @@ -127,7 +131,11 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; - fh_fastafs << "\x00\x00"s;// the flag for now, set to INCOMPLETE as writing is in progress + + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // iterate until first sequence is found, ensuring we won't write to uninitialized sequences @@ -280,8 +288,13 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f for(size_t i = 0; i < index.size(); i++) { s = index[i]; - // flag - fh_fastafs << "\x00\x08"s;// 00001000 (DNA + completed-with-checksum) | this probably has to be mirrored as last and first bit are swapped + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + fsf.set_dna(); + fsf.set_complete(); + fh_fastafs << fsf.get_bits()[0]; + fh_fastafs << fsf.get_bits()[1]; // name unsigned char name_size = (unsigned char) s->name.size(); @@ -297,7 +310,11 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f // update header: set to updated fh_fastafs.seekp(8, std::ios::beg); - fh_fastafs << "\x00\x01"s; // updated flag + + ffsf.set_complete(); + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); diff --git a/src/flags.cpp b/src/flags.cpp index f3a98a80..4b117000 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -6,6 +6,13 @@ +twobit_flag::twobit_flag() { + // ensure all bits are set, this prevents unexpected or undefined behaviour + this->bits[0] = '\0'; + this->bits[1] = '\0'; +} + + void twobit_flag::set(char *data) { @@ -46,6 +53,11 @@ void twobit_flag::set_flag(unsigned char bit, bool enable) { } +std::array &twobit_flag::get_bits(void) { + return this->bits; +} + + bool fastafs_flags::is_complete() { diff --git a/src/ucsc2bit_to_fastafs.cpp b/src/ucsc2bit_to_fastafs.cpp index de1f0e5b..93c2b43b 100644 --- a/src/ucsc2bit_to_fastafs.cpp +++ b/src/ucsc2bit_to_fastafs.cpp @@ -4,6 +4,7 @@ #include "config.hpp" #include "ucsc2bit_to_fastafs.hpp" +#include "flags.hpp" #include "utils.hpp" @@ -29,6 +30,9 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) fastafs fs_new = fastafs(""); uint32_t i, j, n; + fastafs_flags ffsf; + ffsf.set_incomplete(); + ucsc2bit_seq_header *s; ucsc2bit_seq_header_conversion_data *t; @@ -39,7 +43,11 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) // Write header fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; - fh_fastafs << "\x00\x00"s;// the flag for now, set to INCOMPLETE as writing is in progress + + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // Read UCSC2bit header (n seq) @@ -222,8 +230,13 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) s = data[i]; t = data2[i]; - // flag - fh_fastafs << "\x00\x08"s; + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + fsf.set_dna(); + fsf.set_complete(); + fh_fastafs << fsf.get_bits()[0]; + fh_fastafs << fsf.get_bits()[1]; // name fh_fastafs.write((char *) &s->name_size, (size_t) 1); // name size @@ -242,7 +255,10 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) // update header: set to updated fh_fastafs.seekp(8, std::ios::beg); - fh_fastafs << "\x00\x01"s; // updated flag + ffsf.set_complete(); + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); } diff --git a/test/cache/test_cache_twobit.cpp b/test/cache/test_cache_twobit.cpp index 5bcb2468..073f09a6 100644 --- a/test/cache/test_cache_twobit.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -230,7 +230,7 @@ BOOST_AUTO_TEST_CASE(test_cache) // GENERIC-HEADER "\x0F\x0A\x46\x53"s// [0, 3] "\x00\x00\x00\x00"s// [4, 7] version - "\x00\x01"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] + "\x80\x00"s// [8, 9] FASTAFS flag [ 10000000 | 00000000 ] "\x00\x00\x01\x37"s // [10, 13] index position in file (153) // DATA @@ -292,25 +292,25 @@ BOOST_AUTO_TEST_CASE(test_cache) // INDEX "\x00\x00\x00\x07"s // [339, 342] 7 sequences - "\x00\x08" // [343, 344] complete, DNA and not circular + "\x010\x00" // [343, 344] complete, DNA and not circular "\x04"s "chr1"s // [345, 349] name "\x00\x00\x00\x0E"s // [350, 353] data position in file (14) - "\x00\x08" // [354, 355] complete, DNA and not circular + "\x010\x00" // [354, 355] complete, DNA and not circular "\x04"s "chr2"s // [356, 360] name "\x00\x00\x00\x36"s // [361, 364] data position in file (54) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x06"s "chr3.1"s // [, ] name "\x00\x00\x00\x65"s // [, ] data position in file (101) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x06"s "chr3.2"s // [, ] name "\x00\x00\x00\x8D"s // [, ] data position in file (141) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x06"s "chr3.3"s // [, ] name "\x00\x00\x00\xB5"s // [, ] data position in file (181) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x04"s "chr4"s // [, ] name "\x00\x00\x00\xDD"s // [, ] data position in file (221) - "\x00\x08" // [, ] complete, DNA and not circular + "\x010\x00" // [, ] complete, DNA and not circular "\x04"s "chr5"s // [, ] name "\x00\x00\x01\x0A"s // [, ] data position in file (290) diff --git a/test/flags/test_flags.cpp b/test/flags/test_flags.cpp index 418058b5..f7801f23 100644 --- a/test/flags/test_flags.cpp +++ b/test/flags/test_flags.cpp @@ -175,6 +175,26 @@ BOOST_AUTO_TEST_CASE(test_fastafs_sequence_flags) BOOST_CHECK_EQUAL(fs.is_complete(), false); BOOST_CHECK_EQUAL(fs.is_incomplete(), true); + + + // get characters + fs.set_incomplete(); + fs.set_linear(); + fs.set_dna(); + + std::array bits = fs.get_bits(); + BOOST_CHECK_EQUAL(bits[0], '\0'); + BOOST_CHECK_EQUAL(bits[1], '\0'); + + + fs.set_complete(); + fs.set_circular(); + fs.set_iupec_nucleotide(); + + bits = fs.get_bits(); + BOOST_CHECK_EQUAL(bits[0], '\x58');// 1011000 + BOOST_CHECK_EQUAL(bits[1], '\0'); + } BOOST_AUTO_TEST_SUITE_END() From 186b7a514c904ee10eec38d0a938b928ad785aeb Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 18 Dec 2019 14:56:25 +0100 Subject: [PATCH 028/119] using flag objects for setting flags --- src/fasta_to_fourbit_fastafs.cpp | 23 +++++++++++++++++++---- src/fasta_to_twobit_fastafs.cpp | 2 -- test/cache/test_cache_fourbit.cpp | 4 ++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index 79b2d242..f003dd46 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -4,6 +4,7 @@ #include "config.hpp" #include "fasta_to_fourbit_fastafs.hpp" +#include "flags.hpp" #include "utils.hpp" @@ -133,6 +134,9 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string std::vector index; fasta_seq_header_fourbit_conversion_data* s; + fastafs_flags ffsf; + ffsf.set_incomplete(); + // @todo use ifstream and ofstream argument types std::string line; std::ifstream fh_fasta(fasta_file.c_str(), std::ios :: in | std::ios :: binary); @@ -141,7 +145,11 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; - fh_fastafs << "\x00\x00"s;// the flag for now, set to INCOMPLETE as writing is in progress + + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; + fh_fastafs << "\x00\x00\x00\x00"s;// position of metedata ~ unknown YET // iterate until first sequence is found, ensuring we won't write to uninitialized sequences @@ -527,8 +535,13 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string for(size_t i = 0; i < index.size(); i++) { s = index[i]; - // flag - fh_fastafs << "\x00\x0A"s;// 00001010 (IUPEC + completed-with-checksum) | this probably has to be mirrored as last and first bit are swapped + // set and write flag + fastafs_sequence_flags fsf; + fsf.set_linear(); + fsf.set_iupec_nucleotide(); + fsf.set_complete(); + fh_fastafs << fsf.get_bits()[0]; + fh_fastafs << fsf.get_bits()[1]; // name unsigned char name_size = (unsigned char) s->name.size(); @@ -544,7 +557,9 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string // update header: set to updated fh_fastafs.seekp(8, std::ios::beg); - fh_fastafs << "\x00\x01"s; // updated flag + ffsf.set_complete(); + fh_fastafs << ffsf.get_bits()[0]; + fh_fastafs << ffsf.get_bits()[1]; uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 67efa253..117db552 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -310,12 +310,10 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f // update header: set to updated fh_fastafs.seekp(8, std::ios::beg); - ffsf.set_complete(); fh_fastafs << ffsf.get_bits()[0]; fh_fastafs << ffsf.get_bits()[1]; - uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index dccf451b..ca9ab3aa 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -147,7 +147,7 @@ BOOST_AUTO_TEST_CASE(test_cache) // GENERIC-HEADER "\x0F\x0A\x46\x53"s// [0, 3] "\x00\x00\x00\x00"s// [4, 7] version - "\x00\x01"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] + "\x80\x00"s// [8, 9] FASTAFS flag [ 00000000 | 00000001 ] "\x00\x00\x00\x68"s // [10, 13] index position in file (104?) // DATA @@ -168,7 +168,7 @@ BOOST_AUTO_TEST_CASE(test_cache) // INDEX "\x00\x00\x00\x01"s // [104, ] 1 sequences - "\x00\x0A" // [, ] complete, IUPEC + "\x50\x00" // [, ] complete, IUPEC [01010000] "\x05"s "IUPAC"s // [, ] name "\x00\x00\x00\x0E"s // [, ] data position in file (14) From c6480d10b90344721408dc4bad91ceb42ce97764 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 18 Dec 2019 15:30:38 +0100 Subject: [PATCH 029/119] sav --- include/fastafs.hpp | 5 +- include/flags.hpp | 8 +++ src/fastafs.cpp | 29 +++++--- src/flags.cpp | 2 + test/CMakeLists.txt | 2 +- test/view/test_view.cpp | 155 +++++++++++++++++++++++----------------- 6 files changed, 123 insertions(+), 78 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 0e04b007..36706419 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -60,7 +60,7 @@ class fastafs_seq uint32_t n;// number nucleotides std::vector n_starts;// start positions (nucleotide positions; 0-based) std::vector n_ends;// end positions (nucleotide positions; 0-based) - fastafs_flags flags; + fastafs_sequence_flags flags; std::vector m_starts;// start positions (nucleotide positions; 0-based) std::vector m_ends;// end positions (nucleotide positions; 0-based) @@ -106,7 +106,8 @@ class fastafs std::string name; std::string filename; std::vector data; - uint16_t flag; + + fastafs_flags flags; uint32_t n(); diff --git a/include/flags.hpp b/include/flags.hpp index b76bb0d2..b9614353 100644 --- a/include/flags.hpp +++ b/include/flags.hpp @@ -92,6 +92,14 @@ class fastafs_sequence_flags : public twobit_flag return !this->is_circular(); }; + bool is_twobit() + { + return (this->is_dna() | this->is_rna()); + }; + bool is_fourbit() + { + return this->is_iupec_nucleotide(); + }; // set by entity diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 27e96744..e475696f 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -31,6 +31,7 @@ #include "twobit_byte.hpp" #include "fastafs.hpp" +//#include "flags.hpp" #include "utils.hpp" @@ -524,7 +525,11 @@ void fastafs::load(std::string afilename) } } - this->flag = twobytes_to_uint(&memblock[8]); + this->flags.set(&memblock[8]); + if(this->flags.is_incomplete()) { + throw std::invalid_argument("Incomplete FASTAFS file (probably terminated during conversion): " + filename); + } + std::streampos file_cursor = (std::streampos) fourbytes_to_uint(&memblock[10], 0); // INDEX @@ -539,8 +544,7 @@ void fastafs::load(std::string afilename) // flag file.read(memblock, 2); - s->flags = fastafs_flags();//twobytes_to_uint(memblock); - s->flags.set(memblock); + s->flags.set(memblock);// should be initialized during construction of this class // name length file.read(memblock, 1); @@ -564,7 +568,12 @@ void fastafs::load(std::string afilename) s->n = fourbytes_to_uint(memblock, 0); // skip nucleotides - file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); + if(s->flags.is_twobit()) { // there fit 4 twobits in a byte, thus divide by 4, + file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); + } + else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, + file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 2), file.beg); + } // N-blocks (and update this->n instantly) file.read(memblock, 4); @@ -581,10 +590,12 @@ void fastafs::load(std::string afilename) s->n += s->n_ends[j] - s->n_starts[j] + 1; } - // MD5-checksum - file.read(memblock, 16); - for(int j = 0; j < 16 ; j ++) { - s->md5_digest[j] = memblock[j]; + // MD5-checksum - only if sequence is complete + if(s->flags.is_complete()) { + file.read(memblock, 16); + for(int j = 0; j < 16 ; j ++) { + s->md5_digest[j] = memblock[j]; + } } // M-blocks @@ -601,9 +612,11 @@ void fastafs::load(std::string afilename) s->m_ends[j] = fourbytes_to_uint(memblock, 0); } } + file.seekg(file_cursor, file.beg); this->data[i] = s; } + file.close(); delete[] memblock; } diff --git a/src/flags.cpp b/src/flags.cpp index 4b117000..f8da710f 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -107,6 +107,8 @@ bool fastafs_sequence_flags::is_circular() { + + void fastafs_sequence_flags::set_dna() { this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,0 this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e9c23267..7007baf7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -22,7 +22,7 @@ add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/ add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 5d852948..b693445f 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -9,6 +9,7 @@ #include "config.hpp" #include "fasta_to_twobit_fastafs.hpp" +#include "fasta_to_fourbit_fastafs.hpp" #include "fastafs.hpp" @@ -348,6 +349,8 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) delete cache_p999; } + + BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) { uint32_t written; @@ -389,81 +392,99 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) -BOOST_AUTO_TEST_CASE(test_chunked_viewing2) -{ - std::string test_name = "test_003"; - std::string fasta_file = "test/data/" + test_name + ".fa"; - std::string fastafs_file = "tmp/" + test_name + ".fastafs"; +//BOOST_AUTO_TEST_CASE(test_chunked_viewing2) +//{ + //std::string test_name = "test_003"; + //std::string fasta_file = "test/data/" + test_name + ".fa"; + //std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - fasta_to_twobit_fastafs(fasta_file, fastafs_file); - fastafs fs = fastafs(test_name); - fs.load(fastafs_file); + //fasta_to_twobit_fastafs(fasta_file, fastafs_file); + //fastafs fs = fastafs(test_name); + //fs.load(fastafs_file); - uint32_t written; - char *buffer = new char[2110];// file size on disk is 2108 bytes - flush_buffer(buffer, 2110, '\0'); + //BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); - std::string std_buffer; - std::ifstream fh(fasta_file.c_str()); - BOOST_REQUIRE(fh.is_open()); + //uint32_t written; + //char *buffer = new char[2110];// file size on disk is 2108 bytes + //flush_buffer(buffer, 2110, '\0'); - size_t size; + //std::string std_buffer; + //std::ifstream fh(fasta_file.c_str()); + //BOOST_REQUIRE(fh.is_open()); - fh.seekg(0, std::ios::end); - size = fh.tellg(); + //size_t size; - BOOST_REQUIRE_EQUAL(size, 2108); + //fh.seekg(0, std::ios::end); + //size = fh.tellg(); - fh.seekg(0, std::ios::beg); - fh.read(buffer, 2108); - fh.close(); - std::string full_file = std::string(buffer); - - BOOST_REQUIRE_EQUAL(full_file.size(), 2108); - - flush_buffer(buffer, 2110, '?'); - ffs2f_init* cache = fs.init_ffs2f(60, true); - - - /* maak alle substrings: - [....] - [...] - [..] - [.] - [...] - [..] - [.] - [..] - [.] - [.] - - */ - for(uint32_t start_pos = 0; start_pos < full_file.size(); start_pos++) { - for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { - std::string substr_file = std::string(full_file, start_pos, buffer_len); - - written = fs.view_fasta_chunk_cached(cache, buffer, buffer_len, start_pos); - std_buffer = std::string(buffer, substr_file.size()); - BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); - BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); - /* debug - if(std_buffer.compare(substr_file) != 0) { - printf(" %d: %d \n", start_pos, buffer_len); - - std::cout << "---- ref: ----\n"; - std::cout << substr_file << "\n"; - std::cout << "----found:----\n"; - std::cout << std_buffer << "\n"; - std::cout << "--------------\n"; - - exit(1); - }*/ - flush_buffer(buffer, 2110, '?'); - } - } + //BOOST_REQUIRE_EQUAL(size, 2108); - delete[] buffer; - delete cache; + //fh.seekg(0, std::ios::beg); + //fh.read(buffer, 2108); + //fh.close(); + //std::string full_file = std::string(buffer); + + //BOOST_REQUIRE_EQUAL(full_file.size(), 2108); + + //flush_buffer(buffer, 2110, '?'); + //ffs2f_init* cache = fs.init_ffs2f(60, true); + + + ///* maak alle substrings: + //[....] + //[...] + //[..] + //[.] + //[...] + //[..] + //[.] + //[..] + //[.] + //[.] + + //*/ + //for(uint32_t start_pos = 0; start_pos < full_file.size(); start_pos++) { + //for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { + //std::string substr_file = std::string(full_file, start_pos, buffer_len); + + //written = fs.view_fasta_chunk_cached(cache, buffer, buffer_len, start_pos); + //std_buffer = std::string(buffer, substr_file.size()); + //BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); + //BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); + ///* debug + //if(std_buffer.compare(substr_file) != 0) { + //printf(" %d: %d \n", start_pos, buffer_len); + + //std::cout << "---- ref: ----\n"; + //std::cout << substr_file << "\n"; + //std::cout << "----found:----\n"; + //std::cout << std_buffer << "\n"; + //std::cout << "--------------\n"; + + //exit(1); + //}*/ + //flush_buffer(buffer, 2110, '?'); + //} + //} + + //delete[] buffer; + //delete cache; +//} + + + + +BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) +{ + std::string test_name = "test_004"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; + + fasta_to_fourbit_fastafs(fasta_file, fastafs_file); + fastafs fs = fastafs(test_name); + fs.load(fastafs_file); + + BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); } From 6bf33d282104e7c75af13c43f57a7aec8effe271 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 19 Dec 2019 09:11:00 +0100 Subject: [PATCH 030/119] memsafe realpath --- src/fastafs.cpp | 10 ++- src/utils.cpp | 18 ++--- test/view/test_view.cpp | 175 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+), 10 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index e475696f..3a50901e 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -61,6 +61,8 @@ uint32_t fastafs_seq::fasta_filesize(uint32_t padding) return 1 + (uint32_t) this->name.size() + 1 + this->n + (this->n + (padding - 1)) / padding; } + + void fastafs_seq::view_fasta(ffs2f_init_seq* cache, std::ifstream *fh) { char buffer[READ_BUFFER_SIZE];// = new char [READ_BUFFER_SIZE]; @@ -566,6 +568,7 @@ void fastafs::load(std::string afilename) // n compressed nucleotides file.read(memblock, 4); s->n = fourbytes_to_uint(memblock, 0); + printf(" s->n: %u %i \n", s->n, s->n); // skip nucleotides if(s->flags.is_twobit()) { // there fit 4 twobits in a byte, thus divide by 4, @@ -1103,7 +1106,11 @@ size_t fastafs::fasta_filesize(uint32_t padding) //if(file.is_open()) { // file.close(); + printf("this->n() = %u %i\n", this->n() , this->n()); + printf("data.size: %i\n", this->data.size()); + for(size_t i = 0; i < this->data.size(); i++) { + printf(" s->n = %u %i\n", this->data[i]->n , this->data[i]->n); n += this->data[i]->fasta_filesize(padding); } @@ -1398,8 +1405,9 @@ uint32_t fastafs::n() } - +/* std::string fastafs::basename() { return ""; } +*/ diff --git a/src/utils.cpp b/src/utils.cpp index be424498..bf607046 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -186,7 +186,11 @@ std::string basename_cpp(std::string fn) //printf("basename: [%s]\n", filename); //std::cout << "basenamepp: |" << filenamepp << "|\n"; - return std::string(filename); + std::string filename_cpp = std::string(filename); + delete[] ts; + delete[] filename; + + return filename_cpp; } @@ -194,12 +198,8 @@ std::string basename_cpp(std::string fn) // https://stackoverflow.com/questions/38456127/what-is-the-value-of-cplusplus-for-c17 - THEN use std::filesystem::canonical(filename) std::string realpath_cpp(std::string fn) { - //std::string out = "asd"; - char *path = realpath(fn.c_str(), NULL); - //printf("realpath: [%s]\n", path); - - //std::string realpathpp = std::string(path); - //std::cout << "realpath: |" << realpathpp << "|\n"; - - return std::string(path); + char buf[1024]; + char *path = realpath(fn.c_str(), buf); + + return std::string(buf); } diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index b693445f..429d7d07 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -485,6 +485,181 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) fs.load(fastafs_file); BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); + BOOST_REQUIRE_EQUAL(fs.fasta_filesize(999), 98); + + + char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer + flush_buffer(buffer, 100, '?'); + + ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); + //uint32_t written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); + //BOOST_CHECK_EQUAL(written, 100); + // std_buffer = std::string(buffer, 100); + // //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG + // //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + //BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\n"), 0); + //flush_buffer(buffer, 100, '?'); + + /* + char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer + std::string std_buffer; + + // init caches + ffs2f_init* cache_p1 = fs.init_ffs2f(1, false); + ffs2f_init* cache_p4 = fs.init_ffs2f(4, false); + ffs2f_init* cache_p5 = fs.init_ffs2f(5, false); + ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); + + // padding: 4 + + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 999 - longer than longest seq + written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTTCCCCAAAAGGGG >chr2 ACTGACTGNNNNACTG >chr3.1 ACTGACTGAAAAC >chr3.2 ACTGACTGAAAACC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTTCCCCAAAAGGGG\n>chr2\nACTGACTGNNNNACTG\n>chr3.1\nACTGACTGAAAAC\n>chr3.2\nACTGACTGAAAACC\n>chr3.3\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 5 - see if 2bit works + written = fs.view_fasta_chunk_cached(cache_p5, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTTC CCCAA AAGGG G >chr2 ACTGA CTGNN NNACT G >chr3.1 ACTGA CTGAA AAC >chr3.2 ACTGA CTGAA AACC >chr3.3 ACTGA CTGAA AACCC >chr4 ACTGN NNN >chr5 NNACT G + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTTC\nCCCAA\nAAGGG\nG\n>chr2\nACTGA\nCTGNN\nNNACT\nG\n>chr3.1\nACTGA\nCTGAA\nAAC\n>chr3.2\nACTGA\nCTGAA\nAACC"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 0); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 1 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 1); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //X----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("chr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 2 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 2); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("hr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 3 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 3); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("r1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 4 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 4); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA\n"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 1, offset 5 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 5); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G + //XXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA\nA"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 6 + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 6); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("TTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>ch"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 7 + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 7); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("TTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 8 + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 8); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("TT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3"), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 9 + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 9); + BOOST_CHECK_EQUAL(written, 100); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(std_buffer.compare("T\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3."), 0); + flush_buffer(buffer, 100, '?'); + + // padding: 4, offset: 10 + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 10); + std_buffer = std::string(buffer, 100); + //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG + //XXXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| + BOOST_CHECK_EQUAL(written, 100); + BOOST_CHECK_EQUAL(std_buffer.compare("\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3"), 0); + flush_buffer(buffer, 100, '?'); + + std::string full_file = ">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3\nACTG\nACTG\nAAAA\nCCC\n>chr4\nACTG\nNNNN\n>chr5\nNNAC\nTG\n"; + //std::string full_file = ">chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG "; + for(uint32_t offset = 0; offset < 62; ++offset) { + std::string substr_file = full_file.substr(offset, 100); + + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, offset); + std_buffer = std::string(buffer, substr_file.size()); + + BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << offset); + + flush_buffer(buffer, 100, '?'); + } + + delete[] buffer; + + delete cache_p1; + delete cache_p4; + delete cache_p5; + delete cache_p999; + + + * */ } From 3e9505180f0921a3774b85704d4e5e06b9a0f4df Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Sat, 21 Dec 2019 15:22:12 +0100 Subject: [PATCH 031/119] removal of obsolete includes and allow compilation on gcc 6.3 --- src/fastafs.cpp | 2 +- src/fuse.cpp | 2 +- src/lsfastafs.cpp | 2 +- src/ucsc2bit.cpp | 2 +- src/utils.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 3a50901e..a7617e63 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +//#include #include #include diff --git a/src/fuse.cpp b/src/fuse.cpp index b81801e2..ee99f07a 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +//#include #include "fuse.hpp" diff --git a/src/lsfastafs.cpp b/src/lsfastafs.cpp index 0a9a33c1..2c3f54da 100644 --- a/src/lsfastafs.cpp +++ b/src/lsfastafs.cpp @@ -1,4 +1,4 @@ -#include +//#include #include #include #include diff --git a/src/ucsc2bit.cpp b/src/ucsc2bit.cpp index 43f06ae2..41bd5cff 100644 --- a/src/ucsc2bit.cpp +++ b/src/ucsc2bit.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +//#include #include "config.hpp" diff --git a/src/utils.cpp b/src/utils.cpp index bf607046..32cd4baa 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -199,7 +199,7 @@ std::string basename_cpp(std::string fn) std::string realpath_cpp(std::string fn) { char buf[1024]; - char *path = realpath(fn.c_str(), buf); + realpath(fn.c_str(), buf); return std::string(buf); } From fda59274d6755a125d085e0ea9a7d3e670ee7f25 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Sat, 21 Dec 2019 15:50:09 +0100 Subject: [PATCH 032/119] small out of mem --- src/fastafs.cpp | 25 ++++++++++++++++++++++++- test/view/test_view.cpp | 20 +++++++++++++++----- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index a7617e63..b1ffbb48 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -531,6 +531,18 @@ void fastafs::load(std::string afilename) if(this->flags.is_incomplete()) { throw std::invalid_argument("Incomplete FASTAFS file (probably terminated during conversion): " + filename); } + + /* + unsigned char bits; + unsigned char bits_per_byte; + if(this->flags.is_twobit()) { + bits = 2; + bits_per_byte = 4; + } + else { + bits = 4; + bits_per_byte = 2; + }*/ std::streampos file_cursor = (std::streampos) fourbytes_to_uint(&memblock[10], 0); @@ -575,12 +587,14 @@ void fastafs::load(std::string afilename) file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); } else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, - file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 2), file.beg); + file.seekg((uint32_t) s->data_position + 4 + ((s->n + 1) / 2), file.beg); } + printf(" s->n: %u %i [post skip]\n", s->n, s->n); // N-blocks (and update this->n instantly) file.read(memblock, 4); uint32_t N_blocks = fourbytes_to_uint(memblock, 0); + printf(" N blocks: %u %i \n", N_blocks, N_blocks); s->n_starts.resize(N_blocks); s->n_ends.resize(N_blocks); for(j = 0; j < s->n_starts.size(); j++) { @@ -592,6 +606,8 @@ void fastafs::load(std::string afilename) s->n_ends[j] = fourbytes_to_uint(memblock, 0); s->n += s->n_ends[j] - s->n_starts[j] + 1; } + printf(" s->n: %u %i [post n]\n", s->n, s->n); + // MD5-checksum - only if sequence is complete if(s->flags.is_complete()) { @@ -600,6 +616,7 @@ void fastafs::load(std::string afilename) s->md5_digest[j] = memblock[j]; } } + printf(" s->n: %u %i [post m5]\n", s->n, s->n); // M-blocks file.read(memblock, 4); @@ -614,14 +631,20 @@ void fastafs::load(std::string afilename) file.read(memblock, 4); s->m_ends[j] = fourbytes_to_uint(memblock, 0); } + printf(" s->n: %u %i [post M]\n", s->n, s->n); } file.seekg(file_cursor, file.beg); + printf(" s->n: %u %i \n", s->n, s->n); this->data[i] = s; + printf(" data[i]->n: %u %i \n", this->data[i]->n, this->data[i]->n); + printf("---\n"); } file.close(); delete[] memblock; + + printf("safe exist?!\n"); } } else { throw std::invalid_argument("Unable to open file '" + afilename + "'"); diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 429d7d07..df8cf161 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -485,13 +485,23 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) fs.load(fastafs_file); BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); - BOOST_REQUIRE_EQUAL(fs.fasta_filesize(999), 98); - - char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer - flush_buffer(buffer, 100, '?'); + printf("number sequences in 4bit fs: %i\n", fs.data.size()); + printf("number sequences in 4bit fs[0] nucleotides: %i\n", fs.data[0]->n); - ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); + BOOST_REQUIRE_EQUAL(fs.fasta_filesize(999), 98); + + printf("number sequences in 4bit fs: %i\n", fs.data.size()); + printf("number sequences in 4bit fs[0] nucleotides: %i\n", fs.data[0]->n); + + + //char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer + //flush_buffer(buffer, 100, '?'); + + //ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); + + + //uint32_t written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); //BOOST_CHECK_EQUAL(written, 100); // std_buffer = std::string(buffer, 100); From 21ce9634b1e83d20c1a112569deb86b57b6734a9 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 10:41:51 +0100 Subject: [PATCH 033/119] sav --- src/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utils.cpp b/src/utils.cpp index bf607046..32cd4baa 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -199,7 +199,7 @@ std::string basename_cpp(std::string fn) std::string realpath_cpp(std::string fn) { char buf[1024]; - char *path = realpath(fn.c_str(), buf); + realpath(fn.c_str(), buf); return std::string(buf); } From af058dac47d2dc7e6a7f39eea18c4de902e851fb Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 11:00:10 +0100 Subject: [PATCH 034/119] sav --- src/fastafs.cpp | 4 +--- src/main.cpp | 2 +- test/view/test_view.cpp | 14 +++++++------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index b1ffbb48..fbe81d48 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -58,6 +58,7 @@ uint32_t fastafs_seq::fasta_filesize(uint32_t padding) } #endif // > chr \n ACTG NNN /number of newlines corresponding to ACTG NNN lines + return 1 + (uint32_t) this->name.size() + 1 + this->n + (this->n + (padding - 1)) / padding; } @@ -1129,11 +1130,8 @@ size_t fastafs::fasta_filesize(uint32_t padding) //if(file.is_open()) { // file.close(); - printf("this->n() = %u %i\n", this->n() , this->n()); - printf("data.size: %i\n", this->data.size()); for(size_t i = 0; i < this->data.size(); i++) { - printf(" s->n = %u %i\n", this->data[i]->n , this->data[i]->n); n += this->data[i]->fasta_filesize(padding); } diff --git a/src/main.cpp b/src/main.cpp index 1e4697ec..8e17c870 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) std::cout << PACKAGE << " v" << PACKAGE_VERSION << GIT_SHA1_STRING << "-release\n\n"; #endif //DEBUG - std::cout << "Copyright (C) 2017 Youri Hoogstrate." << "\n"; + std::cout << "Copyright (C) 2017 Dr. Youri Hoogstrate." << "\n"; std::cout << "License GPLv2+: GNU GPL version 2 or later .\n"; std::cout << "This is free software: you are free to change and redistribute it.\n"; std::cout << "There is NO WARRANTY, to the extent permitted by law.\n\n"; diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index df8cf161..3b532644 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -486,17 +486,17 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); - printf("number sequences in 4bit fs: %i\n", fs.data.size()); - printf("number sequences in 4bit fs[0] nucleotides: %i\n", fs.data[0]->n); + printf("number sequences in 4bit fs: %u\n", (unsigned int) fs.data.size()); + printf("number sequences in 4bit fs[0] nucleotides: %u\n", fs.data[0]->n); - BOOST_REQUIRE_EQUAL(fs.fasta_filesize(999), 98); + BOOST_REQUIRE_EQUAL(fs.fasta_filesize(32), 98); - printf("number sequences in 4bit fs: %i\n", fs.data.size()); - printf("number sequences in 4bit fs[0] nucleotides: %i\n", fs.data[0]->n); + printf("number sequences in 4bit fs: %u\n", (unsigned int) fs.data.size()); + printf("number sequences in 4bit fs[0] nucleotides: %u\n", fs.data[0]->n); - //char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer - //flush_buffer(buffer, 100, '?'); + char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer + flush_buffer(buffer, 100, '?'); //ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); From 35f147d9341610bbdc4e7152c6c7ce903def0e3a Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 11:35:00 +0100 Subject: [PATCH 035/119] small changes --- include/fastafs.hpp | 2 ++ src/fastafs.cpp | 25 ++++++++++++++++++++++++- test/view/test_view.cpp | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 36706419..f9af14b2 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -75,11 +75,13 @@ class fastafs_seq void view_fasta(ffs2f_init_seq*, std::ifstream *); uint32_t view_fasta_chunk_cached(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + uint32_t view_fasta_chunk_cached_twobit(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. uint32_t n_twobits(); + uint32_t n_fourbits(); static uint32_t n_padding(uint32_t, uint32_t, uint32_t); bool get_n_offset(uint32_t, uint32_t *); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index fbe81d48..278239c0 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -110,6 +110,7 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all data->n_starts[i] = fasta_header_size + this->n_starts[i] + (this->n_starts[i] / padding); data->n_ends[i] = fasta_header_size + this->n_ends[i] + (this->n_ends[i] / padding); } + block_size = data->n_starts.size(); data->n_starts[block_size - 1] = max_val; data->n_ends[block_size - 1] = max_val; @@ -129,6 +130,28 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all return data; } + + +// @todo templating like stuff +uint32_t fastafs_seq::view_fasta_chunk_cached( + ffs2f_init_seq* cache, + char *buffer, + + size_t buffer_size, + off_t start_pos_in_fasta, + + std::ifstream *fh) +{ + if(this->flags.is_dna()) { + return this->view_fasta_chunk_cached_twobit(cache, buffer, buffer_size, start_pos_in_fasta, fh); + } + else { + std::runtime_error("[fastafs_seq::view_fasta_chunk_cached] no 4-bit support yet\n"); + + return 0; + } +} + /* * fastafs_seq::view_fasta_chunk_cached - * @@ -143,7 +166,7 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all * * @todo see if this can be a std::ifstream or some kind of stream type of object? */ -uint32_t fastafs_seq::view_fasta_chunk_cached( +uint32_t fastafs_seq::view_fasta_chunk_cached_twobit( ffs2f_init_seq* cache, char *buffer, diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 3b532644..4af9b005 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -498,7 +498,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer flush_buffer(buffer, 100, '?'); - //ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); + ffs2f_init* cache_p32 = fs.init_ffs2f(32, false); From b411d9d7c2dadca2bacb06b3c27f4afa36b622e7 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 11:42:39 +0100 Subject: [PATCH 036/119] asd --- src/fastafs.cpp | 6 +++++- test/view/test_view.cpp | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 278239c0..d76444ab 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -142,11 +142,15 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( std::ifstream *fh) { + printf("check: "); if(this->flags.is_dna()) { + printf("A\n "); return this->view_fasta_chunk_cached_twobit(cache, buffer, buffer_size, start_pos_in_fasta, fh); } else { - std::runtime_error("[fastafs_seq::view_fasta_chunk_cached] no 4-bit support yet\n"); + printf("B\n "); + + throw std::runtime_error("[fastafs_seq::view_fasta_chunk_cached] no 4-bit support yet\n"); return 0; } diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 4af9b005..a6affed0 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -502,7 +502,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) - //uint32_t written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); + uint32_t written = fs.view_fasta_chunk_cached(cache_p32, buffer, 100, 0); //BOOST_CHECK_EQUAL(written, 100); // std_buffer = std::string(buffer, 100); // //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG From b5bde4aa06062f8f5a65735bb27423616731d668 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 12:43:30 +0100 Subject: [PATCH 037/119] nice steps towards x-bit templating --- CMakeLists.txt | 2 + include/fastafs.hpp | 3 + include/fourbit_byte.hpp | 5 +- include/twobit_byte.hpp | 5 +- src/fastafs.cpp | 162 ++++++++++++++++++++++++++++++++++-- src/fourbit_byte.cpp | 6 +- src/twobit_byte.cpp | 6 +- src/ucsc2bit.cpp | 2 +- src/ucsc2bit_to_fastafs.cpp | 2 +- test/CMakeLists.txt | 12 +-- test/view/test_view.cpp | 3 +- 11 files changed, 186 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f8aa6cf..29d60522 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,6 +106,7 @@ add_executable(fastafs src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp + src/fourbit_byte.cpp src/database.cpp src/utils.cpp src/fuse.cpp @@ -122,6 +123,7 @@ add_executable(mount.fastafs src/fastafs.cpp src/ucsc2bit.cpp src/twobit_byte.cpp + src/fourbit_byte.cpp src/database.cpp src/utils.cpp src/fuse.cpp diff --git a/include/fastafs.hpp b/include/fastafs.hpp index f9af14b2..dd126efb 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -76,6 +76,9 @@ class fastafs_seq uint32_t view_fasta_chunk_cached(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); uint32_t view_fasta_chunk_cached_twobit(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + + // T : twobit_byte or fourbit_byte + template uint32_t view_fasta_chunk_cached_fourbit(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. diff --git a/include/fourbit_byte.hpp b/include/fourbit_byte.hpp index aa826406..f8c42e2e 100644 --- a/include/fourbit_byte.hpp +++ b/include/fourbit_byte.hpp @@ -9,7 +9,10 @@ class fourbit_byte { public: static const char fourbit_alhpabet[17]; - static const char fourbit_hash[256][3]; + static const char encode_hash[256][3]; + + static const char bits_per_nucleotide = 4; + static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; unsigned char data; void set(unsigned char, unsigned char); diff --git a/include/twobit_byte.hpp b/include/twobit_byte.hpp index 805ddb17..0466d2b7 100644 --- a/include/twobit_byte.hpp +++ b/include/twobit_byte.hpp @@ -8,7 +8,10 @@ class twobit_byte { public: - static const char twobit_hash[256][5]; + static const char encode_hash[256][5]; + + static const char bits_per_nucleotide = 2; + static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; unsigned char data; void set(unsigned char, unsigned char); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index d76444ab..3be4406d 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -30,6 +30,7 @@ #include "config.hpp" #include "twobit_byte.hpp" +#include "fourbit_byte.hpp" #include "fastafs.hpp" //#include "flags.hpp" #include "utils.hpp" @@ -144,13 +145,11 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( { printf("check: "); if(this->flags.is_dna()) { - printf("A\n "); return this->view_fasta_chunk_cached_twobit(cache, buffer, buffer_size, start_pos_in_fasta, fh); + //return this->view_fasta_chunk_cached_fourbit(cache, buffer, buffer_size, start_pos_in_fasta, fh); } else { - printf("B\n "); - - throw std::runtime_error("[fastafs_seq::view_fasta_chunk_cached] no 4-bit support yet\n"); + return this->view_fasta_chunk_cached_fourbit(cache, buffer, buffer_size, start_pos_in_fasta, fh); return 0; } @@ -246,7 +245,7 @@ uint32_t fastafs_seq::view_fasta_chunk_cached_twobit( */ twobit_byte t = twobit_byte(); - const char *chunk = twobit_byte::twobit_hash[0]; + const char *chunk = twobit_byte::encode_hash[0]; unsigned char twobit_offset = (nucleotide_pos - n_passed) % 4; if(twobit_offset != 0) { fh->read((char*)(&t.data), 1); @@ -318,6 +317,159 @@ uint32_t fastafs_seq::view_fasta_chunk_cached_twobit( + +//@todo template T +template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( + ffs2f_init_seq* cache, + char *buffer, + + size_t buffer_size, + off_t start_pos_in_fasta, + + std::ifstream *fh) +{ +#if DEBUG + if(cache == nullptr) { + throw std::runtime_error("Empty cache was provided\n"); + } +#endif //DEBUG + + T t = T();// nice way of having this templated object on stack :) + uint32_t written = 0; + printf("Hello world - T(4bit)::nucleotides_per_byte = %i [ 2?!] \n", t.nucleotides_per_byte); + printf("Hello world - T(4bit)::nucleotides_per_byte = %i [ 2?!] \n", T::nucleotides_per_byte); + + + if(written >= buffer_size) { // requesting a buffer of size=0, should throw an exception? + return written; + } + + uint32_t pos = (uint32_t) start_pos_in_fasta; + uint32_t pos_limit = 0; + + // > + pos_limit += 1; + if(pos < pos_limit) { + buffer[written++] = '>'; + pos++; + if(written >= buffer_size) { + return written; + } + } + + // sequence name + pos_limit += (uint32_t) this->name.size(); + while(pos < pos_limit) { + buffer[written++] = this->name[this->name.size() - (pos_limit - pos)]; + pos++; + if(written >= buffer_size) { + return written; + } + } + + // \n + pos_limit += 1; + if(pos < pos_limit) { + buffer[written++] = '\n'; + pos++; + if(written >= buffer_size) { + return written; + } + } + + const uint32_t offset_from_sequence_line = pos - pos_limit; + size_t n_block = cache->n_starts.size(); + size_t m_block = cache->m_starts.size(); + uint32_t newlines_passed = offset_from_sequence_line / (cache->padding + 1);// number of newlines passed (within the sequence part) + uint32_t nucleotide_pos = offset_from_sequence_line - newlines_passed;// requested nucleotide in file + + // calculate file position for next twobit + // when we are in an OPEN n block, we need to go to the first non-N base after, and place the file pointer there + uint32_t n_passed = 0; + this->get_n_offset(nucleotide_pos, &n_passed); + fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / 4), fh->beg); + /* + 0 0 0 0 1 1 1 1 << desired offset from starting point + A C T G A C T G + * + + handigste is om file pointer naar de byte ervoor te zetten + vervolgens wanneer twobit_offset gelijk is aan nul, lees je de volgende byte + * nooit out of bound + + */ + const char *chunk = T::encode_hash[0];// init + unsigned char twobit_offset = (nucleotide_pos - n_passed) % T::nucleotides_per_byte;// twobit -> 4, fourbit: -> 2 + if(twobit_offset != 0) { + fh->read((char*)(&t.data), 1); + chunk = t.get(); + } + while(n_block > 0 and pos <= cache->n_ends[n_block - 1]) { // iterate back + n_block--; + } + while(m_block > 0 and pos <= cache->m_ends[m_block - 1]) { // iterate back + m_block--; + } + + // write sequence + pos_limit += newlines_passed * (cache->padding + 1);// passed sequence-containg lines + while(newlines_passed < cache->total_sequence_containing_lines) { // only 'complete' lines that are guarenteed 'padding' number of nucleotides long [ this loop starts at one to be unsigned-safe ] + pos_limit += std::min(cache->padding, this->n - (newlines_passed * cache->padding));// only last line needs to be smaller ~ calculate from the beginning of newlines_passed + + // write nucleotides + while(pos < pos_limit) {// while next sequence-containing-line is open + if(pos >= cache->n_starts[n_block]) { + if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case + buffer[written++] = 'n'; + } else { + buffer[written++] = 'N'; + } + } else { + if(twobit_offset % 4 == 0) { + fh->read((char*)(&t.data), 1); + chunk = t.get(); + } + + if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case + buffer[written++] = (unsigned char)(chunk[twobit_offset] + 32); + } else { + buffer[written++] = chunk[twobit_offset]; + } + + twobit_offset = (unsigned char)(twobit_offset + 1) % 4; + } + if(pos == cache->n_ends[n_block]) { + n_block++; + } + if(pos == cache->m_ends[m_block]) { + m_block++; + } + pos++; + + if(written >= buffer_size) { + //fh->clear(); + return written; + } + } + + // write newline + pos_limit += 1; + if(pos < pos_limit) { + buffer[written++] = '\n'; + pos++; + if(written >= buffer_size) { + //fh->clear(); + return written; + } + } + newlines_passed++; + } + //fh->clear(); + return written; +} + + + /* CRAM specification: diff --git a/src/fourbit_byte.cpp b/src/fourbit_byte.cpp index da44544e..399d790d 100644 --- a/src/fourbit_byte.cpp +++ b/src/fourbit_byte.cpp @@ -21,7 +21,7 @@ binary: IUPEC */ const char fourbit_byte::fourbit_alhpabet[17] = "ACGTURYKMSWBDHVN"; -const char fourbit_byte::fourbit_hash[256][3] = {"AA", "AC", "AG", "AT", "AU", "AR", "AY", "AK", "AM", "AS", "AW", "AB", "AD", "AH", "AV", "AN", "CA", "CC", "CG", "CT", "CU", "CR", "CY", "CK", "CM", "CS", "CW", "CB", "CD", "CH", "CV", "CN", "GA", "GC", "GG", "GT", "GU", "GR", "GY", "GK", "GM", "GS", "GW", "GB", "GD", "GH", "GV", "GN", "TA", "TC", "TG", "TT", "TU", "TR", "TY", "TK", "TM", "TS", "TW", "TB", "TD", "TH", "TV", "TN", "UA", "UC", "UG", "UT", "UU", "UR", "UY", "UK", "UM", "US", "UW", "UB", "UD", "UH", "UV", "UN", "RA", "RC", "RG", "RT", "RU", "RR", "RY", "RK", "RM", "RS", "RW", "RB", "RD", "RH", "RV", "RN", "YA", "YC", "YG", "YT", "YU", "YR", "YY", "YK", "YM", "YS", "YW", "YB", "YD", "YH", "YV", "YN", "KA", "KC", "KG", "KT", "KU", "KR", "KY", "KK", "KM", "KS", "KW", "KB", "KD", "KH", "KV", "KN", "MA", "MC", "MG", "MT", "MU", "MR", "MY", "MK", "MM", "MS", "MW", "MB", "MD", "MH", "MV", "MN", "SA", "SC", "SG", "ST", "SU", "SR", "SY", "SK", "SM", "SS", "SW", "SB", "SD", "SH", "SV", "SN", "WA", "WC", "WG", "WT", "WU", "WR", "WY", "WK", "WM", "WS", "WW", "WB", "WD", "WH", "WV", "WN", "BA", "BC", "BG", "BT", "BU", "BR", "BY", "BK", "BM", "BS", "BW", "BB", "BD", "BH", "BV", "BN", "DA", "DC", "DG", "DT", "DU", "DR", "DY", "DK", "DM", "DS", "DW", "DB", "DD", "DH", "DV", "DN", "HA", "HC", "HG", "HT", "HU", "HR", "HY", "HK", "HM", "HS", "HW", "HB", "HD", "HH", "HV", "HN", "VA", "VC", "VG", "VT", "VU", "VR", "VY", "VK", "VM", "VS", "VW", "VB", "VD", "VH", "VV", "VN", "NA", "NC", "NG", "NT", "NU", "NR", "NY", "NK", "NM", "NS", "NW", "NB", "ND", "NH", "NV", "NN"}; +const char fourbit_byte::encode_hash[256][3] = {"AA", "AC", "AG", "AT", "AU", "AR", "AY", "AK", "AM", "AS", "AW", "AB", "AD", "AH", "AV", "AN", "CA", "CC", "CG", "CT", "CU", "CR", "CY", "CK", "CM", "CS", "CW", "CB", "CD", "CH", "CV", "CN", "GA", "GC", "GG", "GT", "GU", "GR", "GY", "GK", "GM", "GS", "GW", "GB", "GD", "GH", "GV", "GN", "TA", "TC", "TG", "TT", "TU", "TR", "TY", "TK", "TM", "TS", "TW", "TB", "TD", "TH", "TV", "TN", "UA", "UC", "UG", "UT", "UU", "UR", "UY", "UK", "UM", "US", "UW", "UB", "UD", "UH", "UV", "UN", "RA", "RC", "RG", "RT", "RU", "RR", "RY", "RK", "RM", "RS", "RW", "RB", "RD", "RH", "RV", "RN", "YA", "YC", "YG", "YT", "YU", "YR", "YY", "YK", "YM", "YS", "YW", "YB", "YD", "YH", "YV", "YN", "KA", "KC", "KG", "KT", "KU", "KR", "KY", "KK", "KM", "KS", "KW", "KB", "KD", "KH", "KV", "KN", "MA", "MC", "MG", "MT", "MU", "MR", "MY", "MK", "MM", "MS", "MW", "MB", "MD", "MH", "MV", "MN", "SA", "SC", "SG", "ST", "SU", "SR", "SY", "SK", "SM", "SS", "SW", "SB", "SD", "SH", "SV", "SN", "WA", "WC", "WG", "WT", "WU", "WR", "WY", "WK", "WM", "WS", "WW", "WB", "WD", "WH", "WV", "WN", "BA", "BC", "BG", "BT", "BU", "BR", "BY", "BK", "BM", "BS", "BW", "BB", "BD", "BH", "BV", "BN", "DA", "DC", "DG", "DT", "DU", "DR", "DY", "DK", "DM", "DS", "DW", "DB", "DD", "DH", "DV", "DN", "HA", "HC", "HG", "HT", "HU", "HR", "HY", "HK", "HM", "HS", "HW", "HB", "HD", "HH", "HV", "HN", "VA", "VC", "VG", "VT", "VU", "VR", "VY", "VK", "VM", "VS", "VW", "VB", "VD", "VH", "VV", "VN", "NA", "NC", "NG", "NT", "NU", "NR", "NY", "NK", "NM", "NS", "NW", "NB", "ND", "NH", "NV", "NN"}; /* @@ -233,7 +233,7 @@ char *fourbit_byte::get(unsigned char length) char *seq = new char[length + 1]; for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 - seq[i] = fourbit_byte::fourbit_hash[this->data][i]; + seq[i] = fourbit_byte::encode_hash[this->data][i]; } seq[length] = '\0'; @@ -244,5 +244,5 @@ char *fourbit_byte::get(unsigned char length) const char *fourbit_byte::get() { - return fourbit_byte::fourbit_hash[this->data]; + return fourbit_byte::encode_hash[this->data]; } diff --git a/src/twobit_byte.cpp b/src/twobit_byte.cpp index 8878a533..38f389a7 100644 --- a/src/twobit_byte.cpp +++ b/src/twobit_byte.cpp @@ -5,7 +5,7 @@ #include "twobit_byte.hpp" -const char twobit_byte::twobit_hash[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"}; +const char twobit_byte::encode_hash[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", "GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"}; /* @@ -120,7 +120,7 @@ char *twobit_byte::get(unsigned char length) char *seq = new char[length + 1]; for(unsigned char i = 0; i < length; i++) { // length = 4: i = 0, 1, 2, 3 - seq[i] = twobit_byte::twobit_hash[this->data][i]; + seq[i] = twobit_byte::encode_hash[this->data][i]; } seq[length] = '\0'; @@ -131,5 +131,5 @@ char *twobit_byte::get(unsigned char length) const char *twobit_byte::get() { - return twobit_byte::twobit_hash[this->data]; + return twobit_byte::encode_hash[this->data]; } diff --git a/src/ucsc2bit.cpp b/src/ucsc2bit.cpp index 41bd5cff..ce26375b 100644 --- a/src/ucsc2bit.cpp +++ b/src/ucsc2bit.cpp @@ -96,7 +96,7 @@ uint32_t ucsc2bit_seq::view_fasta_chunk(uint32_t padding, char *buffer, size_t b fh->seekg((uint32_t) this->sequence_data_position + ((nucleotide_pos) / 4), std::ios::beg);// std::ios::beg | fh->beg twobit_byte t = twobit_byte(); - const char *chunk = twobit_byte::twobit_hash[0]; + const char *chunk = twobit_byte::encode_hash[0]; unsigned char twobit_offset = nucleotide_pos % 4; diff --git a/src/ucsc2bit_to_fastafs.cpp b/src/ucsc2bit_to_fastafs.cpp index 93c2b43b..7fc95dda 100644 --- a/src/ucsc2bit_to_fastafs.cpp +++ b/src/ucsc2bit_to_fastafs.cpp @@ -119,7 +119,7 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) // parse and convert sequence fh_ucsc2bit.read(buffer, 4); twobit_byte t_in = twobit_byte(); - const char *decoded_in = t_in.twobit_hash[0];// unnecessary initialization but otherwise gcc whines + const char *decoded_in = t_in.encode_hash[0];// unnecessary initialization but otherwise gcc whines twobit_byte t_out = twobit_byte(); uint32_t k = 0; // iterator in fastafs format diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 7007baf7..04e31b89 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,16 +18,16 @@ set(BUILD_DIR "../bin") set(BUILD_TEST_DIR "${BUILD_DIR}/test") -add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) -add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/utils.cpp) +add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) #add_executable(test_tree tree/test_tree.cpp) diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index a6affed0..f07bf518 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -501,8 +501,9 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) ffs2f_init* cache_p32 = fs.init_ffs2f(32, false); - uint32_t written = fs.view_fasta_chunk_cached(cache_p32, buffer, 100, 0); + printf("\n------\n%s\n------\n", buffer); + //BOOST_CHECK_EQUAL(written, 100); // std_buffer = std::string(buffer, 100); // //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG From b974f0f0c7bcfb616e24e256c93575056b746fad Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 12:56:35 +0100 Subject: [PATCH 038/119] closer --- src/fastafs.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 3be4406d..742b6496 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -387,7 +387,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( // when we are in an OPEN n block, we need to go to the first non-N base after, and place the file pointer there uint32_t n_passed = 0; this->get_n_offset(nucleotide_pos, &n_passed); - fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / 4), fh->beg); + fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / T::bits_per_nucleotide), fh->beg); /* 0 0 0 0 1 1 1 1 << desired offset from starting point A C T G A C T G @@ -411,13 +411,20 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( m_block--; } + printf(" check \n"); + printf(" cache->total_sequence_containing_lines: %i \n", cache->total_sequence_containing_lines); + // write sequence pos_limit += newlines_passed * (cache->padding + 1);// passed sequence-containg lines while(newlines_passed < cache->total_sequence_containing_lines) { // only 'complete' lines that are guarenteed 'padding' number of nucleotides long [ this loop starts at one to be unsigned-safe ] + printf(" - entering line \n"); pos_limit += std::min(cache->padding, this->n - (newlines_passed * cache->padding));// only last line needs to be smaller ~ calculate from the beginning of newlines_passed + printf(" %i < %i (pos, pos_limit) \n", pos, pos_limit); + // write nucleotides while(pos < pos_limit) {// while next sequence-containing-line is open + printf(" twobit offset: %i\n", twobit_offset); if(pos >= cache->n_starts[n_block]) { if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case buffer[written++] = 'n'; @@ -425,7 +432,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( buffer[written++] = 'N'; } } else { - if(twobit_offset % 4 == 0) { + if(twobit_offset % T::nucleotides_per_byte == 0) { fh->read((char*)(&t.data), 1); chunk = t.get(); } @@ -436,7 +443,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( buffer[written++] = chunk[twobit_offset]; } - twobit_offset = (unsigned char)(twobit_offset + 1) % 4; + twobit_offset = (unsigned char)(twobit_offset + 1) % T::nucleotides_per_byte; } if(pos == cache->n_ends[n_block]) { n_block++; From 0e281a370ac551e81583499c26fea8ee67913e27 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 13:00:35 +0100 Subject: [PATCH 039/119] only masking (upper/lowercase) not working well --- include/fourbit_byte.hpp | 2 ++ include/twobit_byte.hpp | 2 ++ src/fastafs.cpp | 4 ++-- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/fourbit_byte.hpp b/include/fourbit_byte.hpp index f8c42e2e..89e46957 100644 --- a/include/fourbit_byte.hpp +++ b/include/fourbit_byte.hpp @@ -10,6 +10,8 @@ class fourbit_byte public: static const char fourbit_alhpabet[17]; static const char encode_hash[256][3]; + static const char n_fill_unmasked = '-'; + static const char n_fill_masked = '-'; static const char bits_per_nucleotide = 4; static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; diff --git a/include/twobit_byte.hpp b/include/twobit_byte.hpp index 0466d2b7..52b4a41b 100644 --- a/include/twobit_byte.hpp +++ b/include/twobit_byte.hpp @@ -9,6 +9,8 @@ class twobit_byte { public: static const char encode_hash[256][5]; + static const char n_fill_unmasked = 'N'; + static const char n_fill_masked = 'n'; static const char bits_per_nucleotide = 2; static const char nucleotides_per_byte = 8 / bits_per_nucleotide ; diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 742b6496..73fb3d2d 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -427,9 +427,9 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( printf(" twobit offset: %i\n", twobit_offset); if(pos >= cache->n_starts[n_block]) { if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = 'n'; + buffer[written++] = T::n_fill_masked; } else { - buffer[written++] = 'N'; + buffer[written++] = T::n_fill_unmasked; } } else { if(twobit_offset % T::nucleotides_per_byte == 0) { From e0d95f0a2017d394ba9adcd24b35417abfdbd9c9 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 14:22:42 +0100 Subject: [PATCH 040/119] nice --- src/fastafs.cpp | 17 +++++++++-------- test/view/test_view.cpp | 15 ++++++--------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 73fb3d2d..1e97c3c3 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -394,13 +394,13 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( * handigste is om file pointer naar de byte ervoor te zetten - vervolgens wanneer twobit_offset gelijk is aan nul, lees je de volgende byte + vervolgens wanneer bit_offset gelijk is aan nul, lees je de volgende byte * nooit out of bound */ const char *chunk = T::encode_hash[0];// init - unsigned char twobit_offset = (nucleotide_pos - n_passed) % T::nucleotides_per_byte;// twobit -> 4, fourbit: -> 2 - if(twobit_offset != 0) { + unsigned char bit_offset = (nucleotide_pos - n_passed) % T::nucleotides_per_byte;// twobit -> 4, fourbit: -> 2 + if(bit_offset != 0) { fh->read((char*)(&t.data), 1); chunk = t.get(); } @@ -413,6 +413,8 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( printf(" check \n"); printf(" cache->total_sequence_containing_lines: %i \n", cache->total_sequence_containing_lines); + printf(" m-blocks: %i %i\n", this->m_starts.size(), cache->m_ends.size()); + printf(" m-block[0] %i ... %i %i\n", cache->m_starts[0], cache->m_ends[0], m_starts[m_block]); // write sequence pos_limit += newlines_passed * (cache->padding + 1);// passed sequence-containg lines @@ -424,7 +426,6 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( // write nucleotides while(pos < pos_limit) {// while next sequence-containing-line is open - printf(" twobit offset: %i\n", twobit_offset); if(pos >= cache->n_starts[n_block]) { if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case buffer[written++] = T::n_fill_masked; @@ -432,18 +433,18 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( buffer[written++] = T::n_fill_unmasked; } } else { - if(twobit_offset % T::nucleotides_per_byte == 0) { + if(bit_offset % T::nucleotides_per_byte == 0) { fh->read((char*)(&t.data), 1); chunk = t.get(); } if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = (unsigned char)(chunk[twobit_offset] + 32); + buffer[written++] = (unsigned char)(chunk[bit_offset] + 32); } else { - buffer[written++] = chunk[twobit_offset]; + buffer[written++] = chunk[bit_offset]; } - twobit_offset = (unsigned char)(twobit_offset + 1) % T::nucleotides_per_byte; + bit_offset = (unsigned char)(bit_offset + 1) % T::nucleotides_per_byte; } if(pos == cache->n_ends[n_block]) { n_block++; diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index f07bf518..05be282b 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -498,18 +498,15 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer flush_buffer(buffer, 100, '?'); - ffs2f_init* cache_p32 = fs.init_ffs2f(32, false); + ffs2f_init* cache_p32 = fs.init_ffs2f(32, true);// allow masking = T uint32_t written = fs.view_fasta_chunk_cached(cache_p32, buffer, 100, 0); - printf("\n------\n%s\n------\n", buffer); - - //BOOST_CHECK_EQUAL(written, 100); - // std_buffer = std::string(buffer, 100); - // //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG - // //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - //BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\n"), 0); - //flush_buffer(buffer, 100, '?'); + + BOOST_CHECK_EQUAL(written, 98); + std::string std_buffer = std::string(buffer, 98); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 100, '?'); /* char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer From 4ef425266bc6d6e45ff48d6b7035583de4c56d81 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 14:23:38 +0100 Subject: [PATCH 041/119] sav --- src/fastafs.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 1e97c3c3..cc84139f 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -452,6 +452,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( if(pos == cache->m_ends[m_block]) { m_block++; } + pos++; if(written >= buffer_size) { @@ -465,13 +466,16 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( if(pos < pos_limit) { buffer[written++] = '\n'; pos++; + if(written >= buffer_size) { //fh->clear(); return written; } } + newlines_passed++; } + //fh->clear(); return written; } From 52db7faab98e092eb91499de09ede12dff666a29 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 14:36:43 +0100 Subject: [PATCH 042/119] right constant = right answer :) --- src/fastafs.cpp | 4 +- test/view/test_view.cpp | 218 +++++++++++++--------------------------- 2 files changed, 72 insertions(+), 150 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index cc84139f..4a68b325 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -336,8 +336,6 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( T t = T();// nice way of having this templated object on stack :) uint32_t written = 0; - printf("Hello world - T(4bit)::nucleotides_per_byte = %i [ 2?!] \n", t.nucleotides_per_byte); - printf("Hello world - T(4bit)::nucleotides_per_byte = %i [ 2?!] \n", T::nucleotides_per_byte); if(written >= buffer_size) { // requesting a buffer of size=0, should throw an exception? @@ -387,7 +385,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( // when we are in an OPEN n block, we need to go to the first non-N base after, and place the file pointer there uint32_t n_passed = 0; this->get_n_offset(nucleotide_pos, &n_passed); - fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / T::bits_per_nucleotide), fh->beg); + fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / T::nucleotides_per_byte), fh->beg); /* 0 0 0 0 1 1 1 1 << desired offset from starting point A C T G A C T G diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 05be282b..9603ae4d 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -485,165 +485,89 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) fs.load(fastafs_file); BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); - - printf("number sequences in 4bit fs: %u\n", (unsigned int) fs.data.size()); - printf("number sequences in 4bit fs[0] nucleotides: %u\n", fs.data[0]->n); - BOOST_REQUIRE_EQUAL(fs.fasta_filesize(32), 98); - printf("number sequences in 4bit fs: %u\n", (unsigned int) fs.data.size()); - printf("number sequences in 4bit fs[0] nucleotides: %u\n", fs.data[0]->n); - - char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer - flush_buffer(buffer, 100, '?'); + char *buffer = new char[200];// buffer needs to be c buffer because of the fuse layer + flush_buffer(buffer, 200, '?'); + ffs2f_init* cache_p1 = fs.init_ffs2f(1, true); + ffs2f_init* cache_p4 = fs.init_ffs2f(4, true); + ffs2f_init* cache_p5 = fs.init_ffs2f(5, true); ffs2f_init* cache_p32 = fs.init_ffs2f(32, true);// allow masking = T - - - uint32_t written = fs.view_fasta_chunk_cached(cache_p32, buffer, 100, 0); - - BOOST_CHECK_EQUAL(written, 98); - std::string std_buffer = std::string(buffer, 98); - BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); - flush_buffer(buffer, 100, '?'); - - /* - char *buffer = new char[100];// buffer needs to be c buffer because of the fuse layer - std::string std_buffer; - - // init caches - ffs2f_init* cache_p1 = fs.init_ffs2f(1, false); - ffs2f_init* cache_p4 = fs.init_ffs2f(4, false); - ffs2f_init* cache_p5 = fs.init_ffs2f(5, false); - ffs2f_init* cache_p999 = fs.init_ffs2f(999, false); - - // padding: 4 - - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 0); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG - //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\n"), 0); - flush_buffer(buffer, 100, '?'); - - // padding: 999 - longer than longest seq - written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTTCCCCAAAAGGGG >chr2 ACTGACTGNNNNACTG >chr3.1 ACTGACTGAAAAC >chr3.2 ACTGACTGAAAACC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG - //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTTCCCCAAAAGGGG\n>chr2\nACTGACTGNNNNACTG\n>chr3.1\nACTGACTGAAAAC\n>chr3.2\nACTGACTGAAAACC\n>chr3.3\nA"), 0); - flush_buffer(buffer, 100, '?'); - - // padding: 5 - see if 2bit works - written = fs.view_fasta_chunk_cached(cache_p5, buffer, 100, 0); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTTC CCCAA AAGGG G >chr2 ACTGA CTGNN NNACT G >chr3.1 ACTGA CTGAA AAC >chr3.2 ACTGA CTGAA AACC >chr3.3 ACTGA CTGAA AACCC >chr4 ACTGN NNN >chr5 NNACT G - //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nTTTTC\nCCCAA\nAAGGG\nG\n>chr2\nACTGA\nCTGNN\nNNACT\nG\n>chr3.1\nACTGA\nCTGAA\nAAC\n>chr3.2\nACTGA\nCTGAA\nAACC"), 0); - flush_buffer(buffer, 100, '?'); - - // padding: 1 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 0); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G - //----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare(">chr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\n"), 0); - flush_buffer(buffer, 100, '?'); - - // padding: 1, offset 1 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 1); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G - //X----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("chr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA"), 0); - flush_buffer(buffer, 100, '?'); - - // padding: 1, offset 2 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 2); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G - //XX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("hr1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\n"), 0); - flush_buffer(buffer, 100, '?'); + ffs2f_init* cache_p999 = fs.init_ffs2f(999, true); - // padding: 1, offset 3 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 3); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G - //XXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("r1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA"), 0); - flush_buffer(buffer, 100, '?'); + std::string std_buffer; + uint32_t written; - // padding: 1, offset 4 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 4); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G - //XXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("1\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA\n"), 0); - flush_buffer(buffer, 100, '?'); - // padding: 1, offset 5 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 5); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G - //XXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("\nT\nT\nT\nT\nC\nC\nC\nC\nA\nA\nA\nA\nG\nG\nG\nG\n>chr2\nA\nC\nT\nG\nA\nC\nT\nG\nN\nN\nN\nN\nA\nC\nT\nG\n>chr3.1\nA\nC\nT\nG\nA\nC\nT\nG\nA\nA\nA"), 0); - flush_buffer(buffer, 100, '?'); - - // padding: 4, offset: 6 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 6); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG - //XXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("TTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>ch"), 0); - flush_buffer(buffer, 100, '?'); + // padding = 32, offset = 0 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 98); + std_buffer = std::string(buffer, 98); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 1 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 1); + BOOST_CHECK_EQUAL(written, 97); + std_buffer = std::string(buffer, 97); + BOOST_CHECK_EQUAL(std_buffer.compare("IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 2 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 2); + BOOST_CHECK_EQUAL(written, 96); + std_buffer = std::string(buffer, 96); + BOOST_CHECK_EQUAL(std_buffer.compare("UPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 5 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 5); + BOOST_CHECK_EQUAL(written, 93); + std_buffer = std::string(buffer, 93); + BOOST_CHECK_EQUAL(std_buffer.compare("C\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 6 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 6); + BOOST_CHECK_EQUAL(written, 92); + std_buffer = std::string(buffer, 92); + BOOST_CHECK_EQUAL(std_buffer.compare("\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 7 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 7); + BOOST_CHECK_EQUAL(written, 91); + std_buffer = std::string(buffer, 91); + BOOST_CHECK_EQUAL(std_buffer.compare("NBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 8 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 8); + BOOST_CHECK_EQUAL(written, 90); + std_buffer = std::string(buffer, 90); + BOOST_CHECK_EQUAL(std_buffer.compare("BKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 9 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 9); + BOOST_CHECK_EQUAL(written, 89); + std_buffer = std::string(buffer, 89); + BOOST_CHECK_EQUAL(std_buffer.compare("KAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 32, offset = 10 + written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 10); + BOOST_CHECK_EQUAL(written, 88); + std_buffer = std::string(buffer, 88); + BOOST_CHECK_EQUAL(std_buffer.compare("AHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); - // padding: 4, offset: 7 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 7); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG - //XXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("TTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr"), 0); - flush_buffer(buffer, 100, '?'); - // padding: 4, offset: 8 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 8); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG - //XXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("TT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3"), 0); - flush_buffer(buffer, 100, '?'); - // padding: 4, offset: 9 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 9); - BOOST_CHECK_EQUAL(written, 100); - std_buffer = std::string(buffer, 100); - //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG - //XXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(std_buffer.compare("T\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3."), 0); - flush_buffer(buffer, 100, '?'); - // padding: 4, offset: 10 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 10); - std_buffer = std::string(buffer, 100); - //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG - //XXXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| - BOOST_CHECK_EQUAL(written, 100); - BOOST_CHECK_EQUAL(std_buffer.compare("\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3"), 0); - flush_buffer(buffer, 100, '?'); + /* std::string full_file = ">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3\nACTG\nACTG\nAAAA\nCCC\n>chr4\nACTG\nNNNN\n>chr5\nNNAC\nTG\n"; //std::string full_file = ">chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG "; From f505d93ab0005bd79e948ef6295f5beb16034833 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 14:40:19 +0100 Subject: [PATCH 043/119] sav --- src/fastafs.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 4a68b325..73bdbc28 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -409,19 +409,11 @@ template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( m_block--; } - printf(" check \n"); - printf(" cache->total_sequence_containing_lines: %i \n", cache->total_sequence_containing_lines); - printf(" m-blocks: %i %i\n", this->m_starts.size(), cache->m_ends.size()); - printf(" m-block[0] %i ... %i %i\n", cache->m_starts[0], cache->m_ends[0], m_starts[m_block]); - // write sequence pos_limit += newlines_passed * (cache->padding + 1);// passed sequence-containg lines while(newlines_passed < cache->total_sequence_containing_lines) { // only 'complete' lines that are guarenteed 'padding' number of nucleotides long [ this loop starts at one to be unsigned-safe ] - printf(" - entering line \n"); pos_limit += std::min(cache->padding, this->n - (newlines_passed * cache->padding));// only last line needs to be smaller ~ calculate from the beginning of newlines_passed - printf(" %i < %i (pos, pos_limit) \n", pos, pos_limit); - // write nucleotides while(pos < pos_limit) {// while next sequence-containing-line is open if(pos >= cache->n_starts[n_block]) { From 42f219d66b26ba47c66016afcb79fe51306905b8 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 14:44:50 +0100 Subject: [PATCH 044/119] cleanup --- src/fastafs.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 73bdbc28..b68670f0 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -143,7 +143,6 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( std::ifstream *fh) { - printf("check: "); if(this->flags.is_dna()) { return this->view_fasta_chunk_cached_twobit(cache, buffer, buffer_size, start_pos_in_fasta, fh); //return this->view_fasta_chunk_cached_fourbit(cache, buffer, buffer_size, start_pos_in_fasta, fh); @@ -762,7 +761,6 @@ void fastafs::load(std::string afilename) // n compressed nucleotides file.read(memblock, 4); s->n = fourbytes_to_uint(memblock, 0); - printf(" s->n: %u %i \n", s->n, s->n); // skip nucleotides if(s->flags.is_twobit()) { // there fit 4 twobits in a byte, thus divide by 4, @@ -771,12 +769,11 @@ void fastafs::load(std::string afilename) else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, file.seekg((uint32_t) s->data_position + 4 + ((s->n + 1) / 2), file.beg); } - printf(" s->n: %u %i [post skip]\n", s->n, s->n); // N-blocks (and update this->n instantly) file.read(memblock, 4); uint32_t N_blocks = fourbytes_to_uint(memblock, 0); - printf(" N blocks: %u %i \n", N_blocks, N_blocks); + s->n_starts.resize(N_blocks); s->n_ends.resize(N_blocks); for(j = 0; j < s->n_starts.size(); j++) { @@ -788,8 +785,6 @@ void fastafs::load(std::string afilename) s->n_ends[j] = fourbytes_to_uint(memblock, 0); s->n += s->n_ends[j] - s->n_starts[j] + 1; } - printf(" s->n: %u %i [post n]\n", s->n, s->n); - // MD5-checksum - only if sequence is complete if(s->flags.is_complete()) { @@ -798,7 +793,6 @@ void fastafs::load(std::string afilename) s->md5_digest[j] = memblock[j]; } } - printf(" s->n: %u %i [post m5]\n", s->n, s->n); // M-blocks file.read(memblock, 4); @@ -813,20 +807,14 @@ void fastafs::load(std::string afilename) file.read(memblock, 4); s->m_ends[j] = fourbytes_to_uint(memblock, 0); } - printf(" s->n: %u %i [post M]\n", s->n, s->n); } file.seekg(file_cursor, file.beg); - printf(" s->n: %u %i \n", s->n, s->n); this->data[i] = s; - printf(" data[i]->n: %u %i \n", this->data[i]->n, this->data[i]->n); - printf("---\n"); } file.close(); delete[] memblock; - - printf("safe exist?!\n"); } } else { throw std::invalid_argument("Unable to open file '" + afilename + "'"); From 44cce450abc4b45fc87648edb1ed2b8577a7c41a Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 15:03:02 +0100 Subject: [PATCH 045/119] sav --- src/fastafs.cpp | 1 + test/view/test_view.cpp | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index b68670f0..0651e794 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -833,6 +833,7 @@ void fastafs::view_fasta(ffs2f_init* cache) for(uint32_t i = 0; i < this->data.size(); i++) { this->data[i]->view_fasta(cache->sequences[i], &file); } + file.close(); } } diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 9603ae4d..e36f20e1 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -565,6 +565,26 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) flush_buffer(buffer, 200, '?'); + // padding = 1, offset = 0 + written = fs.view_fasta_chunk_cached(cache_p1, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 183); + std_buffer = std::string(buffer, 183); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nN\nB\nK\nA\nH\nM\nD\nC\nU\nW\nG\nS\nY\nV\nT\nR\nH\nG\nW\nV\nU\nM\nT\nB\nS\nD\nN\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\nB\nG\nY\nA\nD\nN\nH\nS\nM\nU\nT\nR\nC\nK\nW\nV\ns\nb\nh\nv\nd\nn\nr\nt\ng\ny\nc\nm\nk\nw\nu\na\nA\nV\nT\nS\nD\nK\nN\nB\n-\n-\n-\nU\nG\nW\nM\nH\nY\nR\nC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 5, offset = 0 + written = fs.view_fasta_chunk_cached(cache_p5, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 113); + std_buffer = std::string(buffer, 113); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAH\nMDCUW\nGSYVT\nRHGWV\nUMTBS\nDN---\n-----\n--BGY\nADNHS\nMUTRC\nKWVsb\nhvdnr\ntgycm\nkwuaA\nVTSDK\nNB---\nUGWMH\nYRC\n"), 0); + flush_buffer(buffer, 200, '?'); + + // padding = 999, offset = 0 + written = fs.view_fasta_chunk_cached(cache_p999, buffer, 200, 0); + BOOST_CHECK_EQUAL(written, 96); + std_buffer = std::string(buffer, 96); + BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN----------BGYADNHSMUTRCKWVsbhvdnrtgycmkwuaAVTSDKNB---UGWMHYRC\n"), 0); + flush_buffer(buffer, 200, '?'); /* From 749f30042786e23c5a011675476e64d6aa445369 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 15:15:46 +0100 Subject: [PATCH 046/119] sav --- test/view/test_view.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index e36f20e1..9731c0ff 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -587,20 +587,18 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) flush_buffer(buffer, 200, '?'); - /* - std::string full_file = ">chr1\nTTTT\nCCCC\nAAAA\nGGGG\n>chr2\nACTG\nACTG\nNNNN\nACTG\n>chr3.1\nACTG\nACTG\nAAAA\nC\n>chr3.2\nACTG\nACTG\nAAAA\nCC\n>chr3.3\nACTG\nACTG\nAAAA\nCCC\n>chr4\nACTG\nNNNN\n>chr5\nNNAC\nTG\n"; - //std::string full_file = ">chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG "; + std::string full_file = ">IUPAC\nNBKA\nHMDC\nUWGS\nYVTR\nHGWV\nUMTB\nSDN-\n----\n----\n-BGY\nADNH\nSMUT\nRCKW\nVsbh\nvdnr\ntgyc\nmkwu\naAVT\nSDKN\nB---\nUGWM\nHYRC\n";// length = 117 for(uint32_t offset = 0; offset < 62; ++offset) { - std::string substr_file = full_file.substr(offset, 100); + std::string substr_file = full_file.substr(offset, 200); - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, offset); + written = fs.view_fasta_chunk_cached(cache_p4, buffer, 200, offset); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << offset); - flush_buffer(buffer, 100, '?'); + flush_buffer(buffer, 200, '?'); } delete[] buffer; @@ -610,11 +608,9 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) delete cache_p5; delete cache_p999; - - * */ } - BOOST_AUTO_TEST_SUITE_END() + From 7e190855c275ae05335c498a2aa98de002bc5a21 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 15:25:03 +0100 Subject: [PATCH 047/119] sav --- include/fastafs.hpp | 5 +- src/fastafs.cpp | 161 ++-------------------------------------- test/view/test_view.cpp | 139 +++++++++++++++++----------------- 3 files changed, 74 insertions(+), 231 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index dd126efb..820f7ba3 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -75,10 +75,7 @@ class fastafs_seq void view_fasta(ffs2f_init_seq*, std::ifstream *); uint32_t view_fasta_chunk_cached(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); - uint32_t view_fasta_chunk_cached_twobit(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); - - // T : twobit_byte or fourbit_byte - template uint32_t view_fasta_chunk_cached_fourbit(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + template uint32_t view_fasta_chunk_cached_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 0651e794..25fc041b 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -144,16 +144,15 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( std::ifstream *fh) { if(this->flags.is_dna()) { - return this->view_fasta_chunk_cached_twobit(cache, buffer, buffer_size, start_pos_in_fasta, fh); - //return this->view_fasta_chunk_cached_fourbit(cache, buffer, buffer_size, start_pos_in_fasta, fh); + return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); } else { - return this->view_fasta_chunk_cached_fourbit(cache, buffer, buffer_size, start_pos_in_fasta, fh); - - return 0; + return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); } } + + /* * fastafs_seq::view_fasta_chunk_cached - * @@ -168,157 +167,7 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( * * @todo see if this can be a std::ifstream or some kind of stream type of object? */ -uint32_t fastafs_seq::view_fasta_chunk_cached_twobit( - ffs2f_init_seq* cache, - char *buffer, - - size_t buffer_size, - off_t start_pos_in_fasta, - - std::ifstream *fh) -{ -#if DEBUG - if(cache == nullptr) { - throw std::runtime_error("Empty cache was provided\n"); - } -#endif //DEBUG - - uint32_t written = 0; - - if(written >= buffer_size) { // requesting a buffer of size=0, should throw an exception? - return written; - } - - uint32_t pos = (uint32_t) start_pos_in_fasta; - uint32_t pos_limit = 0; - - // > - pos_limit += 1; - if(pos < pos_limit) { - buffer[written++] = '>'; - pos++; - if(written >= buffer_size) { - return written; - } - } - - // sequence name - pos_limit += (uint32_t) this->name.size(); - while(pos < pos_limit) { - buffer[written++] = this->name[this->name.size() - (pos_limit - pos)]; - pos++; - if(written >= buffer_size) { - return written; - } - } - - // \n - pos_limit += 1; - if(pos < pos_limit) { - buffer[written++] = '\n'; - pos++; - if(written >= buffer_size) { - return written; - } - } - - const uint32_t offset_from_sequence_line = pos - pos_limit; - size_t n_block = cache->n_starts.size(); - size_t m_block = cache->m_starts.size(); - uint32_t newlines_passed = offset_from_sequence_line / (cache->padding + 1);// number of newlines passed (within the sequence part) - uint32_t nucleotide_pos = offset_from_sequence_line - newlines_passed;// requested nucleotide in file - - // calculate file position for next twobit - // when we are in an OPEN n block, we need to go to the first non-N base after, and place the file pointer there - uint32_t n_passed = 0; - this->get_n_offset(nucleotide_pos, &n_passed); - fh->seekg((uint32_t) this->data_position + 4 + ((nucleotide_pos - n_passed) / 4), fh->beg); - /* - 0 0 0 0 1 1 1 1 << desired offset from starting point - A C T G A C T G - * - - handigste is om file pointer naar de byte ervoor te zetten - vervolgens wanneer twobit_offset gelijk is aan nul, lees je de volgende byte - * nooit out of bound - - */ - twobit_byte t = twobit_byte(); - const char *chunk = twobit_byte::encode_hash[0]; - unsigned char twobit_offset = (nucleotide_pos - n_passed) % 4; - if(twobit_offset != 0) { - fh->read((char*)(&t.data), 1); - chunk = t.get(); - } - while(n_block > 0 and pos <= cache->n_ends[n_block - 1]) { // iterate back - n_block--; - } - while(m_block > 0 and pos <= cache->m_ends[m_block - 1]) { // iterate back - m_block--; - } - - // write sequence - pos_limit += newlines_passed * (cache->padding + 1);// passed sequence-containg lines - while(newlines_passed < cache->total_sequence_containing_lines) { // only 'complete' lines that are guarenteed 'padding' number of nucleotides long [ this loop starts at one to be unsigned-safe ] - pos_limit += std::min(cache->padding, this->n - (newlines_passed * cache->padding));// only last line needs to be smaller ~ calculate from the beginning of newlines_passed - - // write nucleotides - while(pos < pos_limit) {// while next sequence-containing-line is open - if(pos >= cache->n_starts[n_block]) { - if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = 'n'; - } else { - buffer[written++] = 'N'; - } - } else { - if(twobit_offset % 4 == 0) { - fh->read((char*)(&t.data), 1); - chunk = t.get(); - } - - if(pos >= cache->m_starts[m_block]) { // IN an m block; lower-case - buffer[written++] = (unsigned char)(chunk[twobit_offset] + 32); - } else { - buffer[written++] = chunk[twobit_offset]; - } - - twobit_offset = (unsigned char)(twobit_offset + 1) % 4; - } - if(pos == cache->n_ends[n_block]) { - n_block++; - } - if(pos == cache->m_ends[m_block]) { - m_block++; - } - pos++; - - if(written >= buffer_size) { - //fh->clear(); - return written; - } - } - - // write newline - pos_limit += 1; - if(pos < pos_limit) { - buffer[written++] = '\n'; - pos++; - if(written >= buffer_size) { - //fh->clear(); - return written; - } - } - newlines_passed++; - } - //fh->clear(); - return written; -} - - - - -//@todo template T -template uint32_t fastafs_seq::view_fasta_chunk_cached_fourbit( +template uint32_t fastafs_seq::view_fasta_chunk_cached_generalized( ffs2f_init_seq* cache, char *buffer, diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index 9731c0ff..e90752fa 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -392,85 +392,83 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) -//BOOST_AUTO_TEST_CASE(test_chunked_viewing2) -//{ - //std::string test_name = "test_003"; - //std::string fasta_file = "test/data/" + test_name + ".fa"; - //std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - - //fasta_to_twobit_fastafs(fasta_file, fastafs_file); - //fastafs fs = fastafs(test_name); - //fs.load(fastafs_file); - - //BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); - - //uint32_t written; - //char *buffer = new char[2110];// file size on disk is 2108 bytes - //flush_buffer(buffer, 2110, '\0'); - - //std::string std_buffer; - //std::ifstream fh(fasta_file.c_str()); - //BOOST_REQUIRE(fh.is_open()); - - //size_t size; - - //fh.seekg(0, std::ios::end); - //size = fh.tellg(); - - //BOOST_REQUIRE_EQUAL(size, 2108); - - //fh.seekg(0, std::ios::beg); - //fh.read(buffer, 2108); - //fh.close(); - //std::string full_file = std::string(buffer); - - //BOOST_REQUIRE_EQUAL(full_file.size(), 2108); +BOOST_AUTO_TEST_CASE(test_chunked_viewing2) +{ + std::string test_name = "test_003"; + std::string fasta_file = "test/data/" + test_name + ".fa"; + std::string fastafs_file = "tmp/" + test_name + ".fastafs"; - //flush_buffer(buffer, 2110, '?'); - //ffs2f_init* cache = fs.init_ffs2f(60, true); + fasta_to_twobit_fastafs(fasta_file, fastafs_file); + fastafs fs = fastafs(test_name); + fs.load(fastafs_file); + BOOST_REQUIRE_EQUAL(fs.flags.is_complete(), true); - ///* maak alle substrings: - //[....] - //[...] - //[..] - //[.] - //[...] - //[..] - //[.] - //[..] - //[.] - //[.] + uint32_t written; + char *buffer = new char[2110];// file size on disk is 2108 bytes + flush_buffer(buffer, 2110, '\0'); - //*/ - //for(uint32_t start_pos = 0; start_pos < full_file.size(); start_pos++) { - //for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { - //std::string substr_file = std::string(full_file, start_pos, buffer_len); + std::string std_buffer; + std::ifstream fh(fasta_file.c_str()); + BOOST_REQUIRE(fh.is_open()); - //written = fs.view_fasta_chunk_cached(cache, buffer, buffer_len, start_pos); - //std_buffer = std::string(buffer, substr_file.size()); - //BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); - //BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); - ///* debug - //if(std_buffer.compare(substr_file) != 0) { - //printf(" %d: %d \n", start_pos, buffer_len); + size_t size; - //std::cout << "---- ref: ----\n"; - //std::cout << substr_file << "\n"; - //std::cout << "----found:----\n"; - //std::cout << std_buffer << "\n"; - //std::cout << "--------------\n"; + fh.seekg(0, std::ios::end); + size = fh.tellg(); - //exit(1); - //}*/ - //flush_buffer(buffer, 2110, '?'); - //} - //} + BOOST_REQUIRE_EQUAL(size, 2108); - //delete[] buffer; - //delete cache; -//} + fh.seekg(0, std::ios::beg); + fh.read(buffer, 2108); + fh.close(); + std::string full_file = std::string(buffer); + + BOOST_REQUIRE_EQUAL(full_file.size(), 2108); + + flush_buffer(buffer, 2110, '?'); + ffs2f_init* cache = fs.init_ffs2f(60, true); + + + /* maak alle substrings: + [....] + [...] + [..] + [.] + [...] + [..] + [.] + [..] + [.] + [.] + */ + for(uint32_t start_pos = 0; start_pos < full_file.size(); start_pos++) { + for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { + std::string substr_file = std::string(full_file, start_pos, buffer_len); + + written = fs.view_fasta_chunk_cached(cache, buffer, buffer_len, start_pos); + std_buffer = std::string(buffer, substr_file.size()); + BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); + BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); + /* debug + if(std_buffer.compare(substr_file) != 0) { + printf(" %d: %d \n", start_pos, buffer_len); + + std::cout << "---- ref: ----\n"; + std::cout << substr_file << "\n"; + std::cout << "----found:----\n"; + std::cout << std_buffer << "\n"; + std::cout << "--------------\n"; + + exit(1); + }*/ + flush_buffer(buffer, 2110, '?'); + } + } + delete[] buffer; + delete cache; +} @@ -607,7 +605,6 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) delete cache_p4; delete cache_p5; delete cache_p999; - } From f7ac903cf53540dcb44f1459c4aa27e89df40c97 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 15:27:26 +0100 Subject: [PATCH 048/119] sav --- src/fastafs.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 25fc041b..968198e9 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -855,6 +855,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi header_offset_previous++; } } + ffs2f_init* cache = this->init_ffs2f(0, false); // false, no masking needed, always upper-case is fine in this case for(i = 0; i < this->data.size(); i++) { sequence = this->data[i]; @@ -895,6 +896,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi return written; } } + uint_to_fourbytes_ucsc2bit(n_seq, sequence->n_ends[k] - sequence->n_starts[k] + 1); pos_limit += 4; while(pos < pos_limit) { @@ -913,6 +915,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -926,6 +929,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -937,6 +941,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = n_seq[4 - (pos_limit - pos)]; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -949,6 +954,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { buffer[written++] = '\0'; pos++; + if(written >= buffer_size) { delete cache; return written; @@ -959,6 +965,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi uint32_t full_twobits = sequence->n / 4; twobit_byte t; pos_limit += full_twobits; + while(pos < pos_limit) { //printf("%i - %i = %i || %i\n",pos_limit,pos, (full_twobits - (pos_limit - pos)) * 4, j); //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file); From 09b4e3429fcb130bfabd6fac2de6be7f9d52a4aa Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 7 Jan 2020 15:31:22 +0100 Subject: [PATCH 049/119] tidyu --- include/fastafs.hpp | 2 +- include/flags.hpp | 75 ++++++------ src/fasta_to_twobit_fastafs.cpp | 2 +- src/fastafs.cpp | 36 +++--- src/flags.cpp | 140 ++++++++++++---------- src/utils.cpp | 2 +- test/flags/test_flags.cpp | 202 ++++++++++++++++---------------- test/view/test_view.cpp | 28 ++--- 8 files changed, 252 insertions(+), 235 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 820f7ba3..51ac1515 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -108,7 +108,7 @@ class fastafs std::string name; std::string filename; std::vector data; - + fastafs_flags flags; uint32_t n(); diff --git a/include/flags.hpp b/include/flags.hpp index b9614353..cdf837d4 100644 --- a/include/flags.hpp +++ b/include/flags.hpp @@ -18,56 +18,57 @@ const unsigned char FASTAFS_SEQUENCE_BITFLAG_CIRCULAR = 4; constexpr std::array bitmasks = { - 0b1000'0000, // represents bit 7 - 0b0100'0000, // represents bit 6 - 0b0010'0000, // represents bit 5 - 0b0001'0000, // represents bit 4 - 0b0000'1000, // represents bit 3 - 0b0000'0100, // represents bit 2 - 0b0000'0010, // represents bit 1 - 0b0000'0001, // represents bit 0 - - 0b1000'0000, // represents bit 7 - 0b0100'0000, // represents bit 6 - 0b0010'0000, // represents bit 5 - 0b0001'0000, // represents bit 4 - 0b0000'1000, // represents bit 3 - 0b0000'0100, // represents bit 2 - 0b0000'0010, // represents bit 1 - 0b0000'0001, // represents bit 0 + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 + + 0b1000'0000, // represents bit 7 + 0b0100'0000, // represents bit 6 + 0b0010'0000, // represents bit 5 + 0b0001'0000, // represents bit 4 + 0b0000'1000, // represents bit 3 + 0b0000'0100, // represents bit 2 + 0b0000'0010, // represents bit 1 + 0b0000'0001, // represents bit 0 }; //#include "utils.hpp" -class twobit_flag { - protected: - twobit_flag(); - - std::array bits;// 00000000 00000000 +class twobit_flag +{ +protected: + twobit_flag(); + + std::array bits; // 00000000 00000000 - // set by flag - void set_flag(unsigned char, bool);// counting flag from bit 0(!) - bool get_flag(unsigned char); + // set by flag + void set_flag(unsigned char, bool);// counting flag from bit 0(!) + bool get_flag(unsigned char); - public: - void set(char *); - std::array &get_bits(void); // get bit 0 or bit 1 +public: + void set(char *); + std::array &get_bits(void); // get bit 0 or bit 1 }; class fastafs_flags : public twobit_flag { - public: - bool is_complete(); - bool is_incomplete() - { - return !this->is_complete(); - }; - - void set_complete(); - void set_incomplete(); +public: + bool is_complete(); + bool is_incomplete() + { + return !this->is_complete(); + }; + + void set_complete(); + void set_incomplete(); }; diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 117db552..9f428fb2 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -131,7 +131,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f if(fh_fasta.is_open() and fh_fastafs.is_open()) { fh_fastafs << FASTAFS_MAGIC; fh_fastafs << FASTAFS_VERSION; - + // the flag for now, set to INCOMPLETE as writing is in progress || spacer that will be overwritten later fh_fastafs << ffsf.get_bits()[0]; fh_fastafs << ffsf.get_bits()[1]; diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 968198e9..4cf2632d 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -59,7 +59,7 @@ uint32_t fastafs_seq::fasta_filesize(uint32_t padding) } #endif // > chr \n ACTG NNN /number of newlines corresponding to ACTG NNN lines - + return 1 + (uint32_t) this->name.size() + 1 + this->n + (this->n + (padding - 1)) / padding; } @@ -145,8 +145,7 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( { if(this->flags.is_dna()) { return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); - } - else { + } else { return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); } } @@ -561,19 +560,19 @@ void fastafs::load(std::string afilename) if(this->flags.is_incomplete()) { throw std::invalid_argument("Incomplete FASTAFS file (probably terminated during conversion): " + filename); } - - /* - unsigned char bits; - unsigned char bits_per_byte; - if(this->flags.is_twobit()) { - bits = 2; - bits_per_byte = 4; - } - else { - bits = 4; - bits_per_byte = 2; - }*/ - + + /* + unsigned char bits; + unsigned char bits_per_byte; + if(this->flags.is_twobit()) { + bits = 2; + bits_per_byte = 4; + } + else { + bits = 4; + bits_per_byte = 2; + }*/ + std::streampos file_cursor = (std::streampos) fourbytes_to_uint(&memblock[10], 0); // INDEX @@ -614,8 +613,7 @@ void fastafs::load(std::string afilename) // skip nucleotides if(s->flags.is_twobit()) { // there fit 4 twobits in a byte, thus divide by 4, file.seekg((uint32_t) s->data_position + 4 + ((s->n + 3) / 4), file.beg); - } - else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, + } else if(s->flags.is_fourbit()) { // there fit 2 fourbits in a byte, thus divide by 2, file.seekg((uint32_t) s->data_position + 4 + ((s->n + 1) / 2), file.beg); } @@ -1156,7 +1154,7 @@ size_t fastafs::fasta_filesize(uint32_t padding) //if(file.is_open()) { // file.close(); - + for(size_t i = 0; i < this->data.size(); i++) { n += this->data[i]->fasta_filesize(padding); } diff --git a/src/flags.cpp b/src/flags.cpp index f8da710f..e1ec3911 100644 --- a/src/flags.cpp +++ b/src/flags.cpp @@ -6,9 +6,10 @@ -twobit_flag::twobit_flag() { - // ensure all bits are set, this prevents unexpected or undefined behaviour - this->bits[0] = '\0'; +twobit_flag::twobit_flag() +{ + // ensure all bits are set, this prevents unexpected or undefined behaviour + this->bits[0] = '\0'; this->bits[1] = '\0'; } @@ -23,85 +24,95 @@ void twobit_flag::set(char *data) // https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ -bool twobit_flag::get_flag(unsigned char bit) { +bool twobit_flag::get_flag(unsigned char bit) +{ #if DEBUG - if(bit >= 16) { - throw std::runtime_error("twobit_flag::get_flag = out of bound: " + std::to_string(bit) + "\n"); - } + if(bit >= 16) { + throw std::runtime_error("twobit_flag::get_flag = out of bound: " + std::to_string(bit) + "\n"); + } #endif //DEBUG - return (this->bits[bit / 8] & bitmasks[bit]); + return (this->bits[bit / 8] & bitmasks[bit]); } // https://www.learncpp.com/cpp-tutorial/bit-manipulation-with-bitwise-operators-and-bit-masks/ -void twobit_flag::set_flag(unsigned char bit, bool enable) { - if(bit >= 16) { - throw std::runtime_error("twobit_flag::set_flag = out of bound: " + std::to_string(bit) + "\n"); - } +void twobit_flag::set_flag(unsigned char bit, bool enable) +{ + if(bit >= 16) { + throw std::runtime_error("twobit_flag::set_flag = out of bound: " + std::to_string(bit) + "\n"); + } - if(enable) { // - //this->bits[bit / 8] |= bitmasks[bit]; - this->bits[bit / 8] = (unsigned char) (this->bits[bit / 8] | bitmasks[bit]); - } - else { - //this->bits[bit / 8] &= ~bitmasks[bit]; - this->bits[bit / 8] = (unsigned char) (this->bits[bit / 8] & ~bitmasks[bit]); - } + if(enable) { // + //this->bits[bit / 8] |= bitmasks[bit]; + this->bits[bit / 8] = (unsigned char)(this->bits[bit / 8] | bitmasks[bit]); + } else { + //this->bits[bit / 8] &= ~bitmasks[bit]; + this->bits[bit / 8] = (unsigned char)(this->bits[bit / 8] & ~bitmasks[bit]); + } } -std::array &twobit_flag::get_bits(void) { - return this->bits; +std::array &twobit_flag::get_bits(void) +{ + return this->bits; } -bool fastafs_flags::is_complete() { - return this->get_flag(FASTAFS_BITFLAG_COMPLETE); +bool fastafs_flags::is_complete() +{ + return this->get_flag(FASTAFS_BITFLAG_COMPLETE); } -void fastafs_flags::set_complete() { - this->set_flag(FASTAFS_BITFLAG_COMPLETE, true); +void fastafs_flags::set_complete() +{ + this->set_flag(FASTAFS_BITFLAG_COMPLETE, true); } -void fastafs_flags::set_incomplete() { - this->set_flag(FASTAFS_BITFLAG_COMPLETE, false); +void fastafs_flags::set_incomplete() +{ + this->set_flag(FASTAFS_BITFLAG_COMPLETE, false); } // alphabet: 'ACTG' + 'N' -bool fastafs_sequence_flags::is_dna() { - return ( - this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && - this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); +bool fastafs_sequence_flags::is_dna() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); } // alphabet: 'ACUG' + 'N' -bool fastafs_sequence_flags::is_rna() { - return ( - this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == true && - this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); +bool fastafs_sequence_flags::is_rna() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == true && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == false); } // alphabet: 'ACGTURYKMSWBDHVN' + '-' -bool fastafs_sequence_flags::is_iupec_nucleotide() { - return ( - this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && - this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == true); +bool fastafs_sequence_flags::is_iupec_nucleotide() +{ + return ( + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1) == false && + this->get_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2) == true); } -bool fastafs_sequence_flags::is_complete() { - return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE); +bool fastafs_sequence_flags::is_complete() +{ + return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE); } -bool fastafs_sequence_flags::is_circular() { - return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR); +bool fastafs_sequence_flags::is_circular() +{ + return this->get_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR); } @@ -109,35 +120,42 @@ bool fastafs_sequence_flags::is_circular() { -void fastafs_sequence_flags::set_dna() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,0 - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); +void fastafs_sequence_flags::set_dna() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,0 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); } -void fastafs_sequence_flags::set_rna() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, true); // 1,0 - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); +void fastafs_sequence_flags::set_rna() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, true); // 1,0 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, false); } -void fastafs_sequence_flags::set_iupec_nucleotide() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,1 - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, true); +void fastafs_sequence_flags::set_iupec_nucleotide() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_1, false); // 0,1 + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_SEQUENCE_TYPE_2, true); } -void fastafs_sequence_flags::set_complete() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, true); +void fastafs_sequence_flags::set_complete() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, true); } -void fastafs_sequence_flags::set_incomplete() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, false); +void fastafs_sequence_flags::set_incomplete() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_COMPLETE, false); } -void fastafs_sequence_flags::set_linear() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, false); +void fastafs_sequence_flags::set_linear() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, false); } -void fastafs_sequence_flags::set_circular() { - this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, true); +void fastafs_sequence_flags::set_circular() +{ + this->set_flag(FASTAFS_SEQUENCE_BITFLAG_CIRCULAR, true); } diff --git a/src/utils.cpp b/src/utils.cpp index 32cd4baa..2e8b916e 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -200,6 +200,6 @@ std::string realpath_cpp(std::string fn) { char buf[1024]; realpath(fn.c_str(), buf); - + return std::string(buf); } diff --git a/test/flags/test_flags.cpp b/test/flags/test_flags.cpp index f7801f23..51e9a2a8 100644 --- a/test/flags/test_flags.cpp +++ b/test/flags/test_flags.cpp @@ -16,72 +16,72 @@ BOOST_AUTO_TEST_SUITE(Testing) BOOST_AUTO_TEST_CASE(test_fastafs_flags) { - fastafs_flags f; - - char buffer[2 + 1]; - buffer[2] = '\0'; - - // test: 00000000 00000000 - buffer[0] = '\x0'; - buffer[1] = '\x0'; - f.set(buffer); - + fastafs_flags f; + + char buffer[2 + 1]; + buffer[2] = '\0'; + + // test: 00000000 00000000 + buffer[0] = '\x0'; + buffer[1] = '\x0'; + f.set(buffer); + BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); - // test: 10000000 00000000 - buffer[0] = '\x80'; // worked with writing to file and checking with `xxd -b file` ~ this is binary equivalent to 10000000 - buffer[1] = '\x0'; - f.set(buffer); - + // test: 10000000 00000000 + buffer[0] = '\x80'; // worked with writing to file and checking with `xxd -b file` ~ this is binary equivalent to 10000000 + buffer[1] = '\x0'; + f.set(buffer); + BOOST_CHECK_EQUAL(f.is_complete(), true); BOOST_CHECK_EQUAL(f.is_incomplete(), false); - // test: 11111111 00000000 - buffer[0] = '\xFF'; - buffer[1] = '\x0'; - f.set(buffer); - + // test: 11111111 00000000 + buffer[0] = '\xFF'; + buffer[1] = '\x0'; + f.set(buffer); + BOOST_CHECK_EQUAL(f.is_complete(), true); BOOST_CHECK_EQUAL(f.is_incomplete(), false); - // test: 00000001 00000000 - buffer[0] = '\x01'; - buffer[1] = '\x0'; - f.set(buffer); - + // test: 00000001 00000000 + buffer[0] = '\x01'; + buffer[1] = '\x0'; + f.set(buffer); + BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); - // re-test: 00000000 00000000 - buffer[0] = '\x0'; - buffer[1] = '\x0'; - f.set(buffer); - + // re-test: 00000000 00000000 + buffer[0] = '\x0'; + buffer[1] = '\x0'; + f.set(buffer); + BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); - - f.set_complete(); + + f.set_complete(); BOOST_CHECK_EQUAL(f.is_complete(), true); BOOST_CHECK_EQUAL(f.is_incomplete(), false); - f.set_complete(); + f.set_complete(); BOOST_CHECK_EQUAL(f.is_complete(), true); BOOST_CHECK_EQUAL(f.is_incomplete(), false); - f.set_complete(); + f.set_complete(); BOOST_CHECK_EQUAL(f.is_complete(), true); BOOST_CHECK_EQUAL(f.is_incomplete(), false); - f.set_incomplete(); + f.set_incomplete(); BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); - f.set_incomplete(); + f.set_incomplete(); BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); - f.set_incomplete(); + f.set_incomplete(); BOOST_CHECK_EQUAL(f.is_complete(), false); BOOST_CHECK_EQUAL(f.is_incomplete(), true); } @@ -89,111 +89,111 @@ BOOST_AUTO_TEST_CASE(test_fastafs_flags) BOOST_AUTO_TEST_CASE(test_fastafs_sequence_flags) { - fastafs_sequence_flags fs; - - fs.set_dna(); + fastafs_sequence_flags fs; + + fs.set_dna(); fs.set_rna(); fs.set_iupec_nucleotide(); - BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); - BOOST_CHECK_EQUAL(fs.is_dna(), false); - BOOST_CHECK_EQUAL(fs.is_rna(), false); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), false); fs.set_iupec_nucleotide(); fs.set_rna(); - fs.set_dna(); + fs.set_dna(); fs.set_rna(); fs.set_iupec_nucleotide(); - BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); - BOOST_CHECK_EQUAL(fs.is_dna(), false); - BOOST_CHECK_EQUAL(fs.is_rna(), false); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), true); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), false); fs.set_iupec_nucleotide(); fs.set_rna(); - fs.set_dna(); - fs.set_dna(); + fs.set_dna(); + fs.set_dna(); fs.set_iupec_nucleotide(); fs.set_rna(); - BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); - BOOST_CHECK_EQUAL(fs.is_dna(), false); - BOOST_CHECK_EQUAL(fs.is_rna(), true); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), true); fs.set_iupec_nucleotide(); fs.set_rna(); - fs.set_dna(); - fs.set_dna(); + fs.set_dna(); + fs.set_dna(); fs.set_iupec_nucleotide(); fs.set_rna(); - BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); - BOOST_CHECK_EQUAL(fs.is_dna(), false); - BOOST_CHECK_EQUAL(fs.is_rna(), true); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), false); + BOOST_CHECK_EQUAL(fs.is_rna(), true); fs.set_iupec_nucleotide(); fs.set_rna(); - fs.set_dna(); + fs.set_dna(); fs.set_iupec_nucleotide(); fs.set_rna(); - fs.set_dna(); - BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); - BOOST_CHECK_EQUAL(fs.is_dna(), true); - BOOST_CHECK_EQUAL(fs.is_rna(), false); + fs.set_dna(); + BOOST_CHECK_EQUAL(fs.is_iupec_nucleotide(), false); + BOOST_CHECK_EQUAL(fs.is_dna(), true); + BOOST_CHECK_EQUAL(fs.is_rna(), false); - fs.set_linear(); - BOOST_CHECK_EQUAL(fs.is_linear(), true); - BOOST_CHECK_EQUAL(fs.is_circular(), false); + fs.set_linear(); + BOOST_CHECK_EQUAL(fs.is_linear(), true); + BOOST_CHECK_EQUAL(fs.is_circular(), false); - fs.set_circular(); - fs.set_circular(); - fs.set_linear(); - BOOST_CHECK_EQUAL(fs.is_linear(), true); - BOOST_CHECK_EQUAL(fs.is_circular(), false); + fs.set_circular(); + fs.set_circular(); + fs.set_linear(); + BOOST_CHECK_EQUAL(fs.is_linear(), true); + BOOST_CHECK_EQUAL(fs.is_circular(), false); - fs.set_linear(); - fs.set_linear(); - fs.set_circular(); - BOOST_CHECK_EQUAL(fs.is_linear(), false); - BOOST_CHECK_EQUAL(fs.is_circular(), true); + fs.set_linear(); + fs.set_linear(); + fs.set_circular(); + BOOST_CHECK_EQUAL(fs.is_linear(), false); + BOOST_CHECK_EQUAL(fs.is_circular(), true); - fs.set_complete(); - BOOST_CHECK_EQUAL(fs.is_complete(), true); - BOOST_CHECK_EQUAL(fs.is_incomplete(), false); + fs.set_complete(); + BOOST_CHECK_EQUAL(fs.is_complete(), true); + BOOST_CHECK_EQUAL(fs.is_incomplete(), false); - fs.set_incomplete(); - fs.set_incomplete(); - fs.set_complete(); - BOOST_CHECK_EQUAL(fs.is_complete(), true); - BOOST_CHECK_EQUAL(fs.is_incomplete(), false); + fs.set_incomplete(); + fs.set_incomplete(); + fs.set_complete(); + BOOST_CHECK_EQUAL(fs.is_complete(), true); + BOOST_CHECK_EQUAL(fs.is_incomplete(), false); - fs.set_complete(); - fs.set_complete(); - fs.set_incomplete(); - BOOST_CHECK_EQUAL(fs.is_complete(), false); - BOOST_CHECK_EQUAL(fs.is_incomplete(), true); + fs.set_complete(); + fs.set_complete(); + fs.set_incomplete(); + BOOST_CHECK_EQUAL(fs.is_complete(), false); + BOOST_CHECK_EQUAL(fs.is_incomplete(), true); - // get characters - fs.set_incomplete(); - fs.set_linear(); - fs.set_dna(); + // get characters + fs.set_incomplete(); + fs.set_linear(); + fs.set_dna(); - std::array bits = fs.get_bits(); - BOOST_CHECK_EQUAL(bits[0], '\0'); - BOOST_CHECK_EQUAL(bits[1], '\0'); + std::array bits = fs.get_bits(); + BOOST_CHECK_EQUAL(bits[0], '\0'); + BOOST_CHECK_EQUAL(bits[1], '\0'); - fs.set_complete(); - fs.set_circular(); - fs.set_iupec_nucleotide(); + fs.set_complete(); + fs.set_circular(); + fs.set_iupec_nucleotide(); - bits = fs.get_bits(); - BOOST_CHECK_EQUAL(bits[0], '\x58');// 1011000 - BOOST_CHECK_EQUAL(bits[1], '\0'); + bits = fs.get_bits(); + BOOST_CHECK_EQUAL(bits[0], '\x58');// 1011000 + BOOST_CHECK_EQUAL(bits[1], '\0'); } diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index e90752fa..bb5255fe 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -495,67 +495,67 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) ffs2f_init* cache_p32 = fs.init_ffs2f(32, true);// allow masking = T ffs2f_init* cache_p999 = fs.init_ffs2f(999, true); - std::string std_buffer; - uint32_t written; + std::string std_buffer; + uint32_t written; - // padding = 32, offset = 0 + // padding = 32, offset = 0 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 98); std_buffer = std::string(buffer, 98); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 1 + // padding = 32, offset = 1 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 1); BOOST_CHECK_EQUAL(written, 97); std_buffer = std::string(buffer, 97); BOOST_CHECK_EQUAL(std_buffer.compare("IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 2 + // padding = 32, offset = 2 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 2); BOOST_CHECK_EQUAL(written, 96); std_buffer = std::string(buffer, 96); BOOST_CHECK_EQUAL(std_buffer.compare("UPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 5 + // padding = 32, offset = 5 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 5); BOOST_CHECK_EQUAL(written, 93); std_buffer = std::string(buffer, 93); BOOST_CHECK_EQUAL(std_buffer.compare("C\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 6 + // padding = 32, offset = 6 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 6); BOOST_CHECK_EQUAL(written, 92); std_buffer = std::string(buffer, 92); BOOST_CHECK_EQUAL(std_buffer.compare("\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 7 + // padding = 32, offset = 7 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 7); BOOST_CHECK_EQUAL(written, 91); std_buffer = std::string(buffer, 91); BOOST_CHECK_EQUAL(std_buffer.compare("NBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 8 + // padding = 32, offset = 8 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 8); BOOST_CHECK_EQUAL(written, 90); std_buffer = std::string(buffer, 90); BOOST_CHECK_EQUAL(std_buffer.compare("BKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 9 + // padding = 32, offset = 9 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 9); BOOST_CHECK_EQUAL(written, 89); std_buffer = std::string(buffer, 89); BOOST_CHECK_EQUAL(std_buffer.compare("KAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 32, offset = 10 + // padding = 32, offset = 10 written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 10); BOOST_CHECK_EQUAL(written, 88); std_buffer = std::string(buffer, 88); @@ -563,21 +563,21 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) flush_buffer(buffer, 200, '?'); - // padding = 1, offset = 0 + // padding = 1, offset = 0 written = fs.view_fasta_chunk_cached(cache_p1, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 183); std_buffer = std::string(buffer, 183); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nN\nB\nK\nA\nH\nM\nD\nC\nU\nW\nG\nS\nY\nV\nT\nR\nH\nG\nW\nV\nU\nM\nT\nB\nS\nD\nN\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\nB\nG\nY\nA\nD\nN\nH\nS\nM\nU\nT\nR\nC\nK\nW\nV\ns\nb\nh\nv\nd\nn\nr\nt\ng\ny\nc\nm\nk\nw\nu\na\nA\nV\nT\nS\nD\nK\nN\nB\n-\n-\n-\nU\nG\nW\nM\nH\nY\nR\nC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 5, offset = 0 + // padding = 5, offset = 0 written = fs.view_fasta_chunk_cached(cache_p5, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 113); std_buffer = std::string(buffer, 113); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAH\nMDCUW\nGSYVT\nRHGWV\nUMTBS\nDN---\n-----\n--BGY\nADNHS\nMUTRC\nKWVsb\nhvdnr\ntgycm\nkwuaA\nVTSDK\nNB---\nUGWMH\nYRC\n"), 0); flush_buffer(buffer, 200, '?'); - // padding = 999, offset = 0 + // padding = 999, offset = 0 written = fs.view_fasta_chunk_cached(cache_p999, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 96); std_buffer = std::string(buffer, 96); From 69301e5b31db4031bd0bce2de411dded8eae0ffa Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 8 Jan 2020 16:11:39 +0100 Subject: [PATCH 050/119] sav --- test/cache/test_cache_fourbit.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index ca9ab3aa..7803cd71 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -193,10 +193,10 @@ BOOST_AUTO_TEST_CASE(test_cache) for(unsigned int i = 0; i < size; i++) { BOOST_CHECK_EQUAL(buffer[i], reference[i]); - if(reference[i] != buffer[i]) { - printf("comparing char %i\n", i); - printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); - } + //if(reference[i] != buffer[i]) { + // printf("comparing char %u\n", i); + // printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); + //} } From ec2f148ceb8ef145ed485cf0cfc5f2a160882cb2 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 8 Jan 2020 16:24:08 +0100 Subject: [PATCH 051/119] sav --- test/cache/test_cache_fourbit.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 7803cd71..fee801f4 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -189,7 +189,12 @@ BOOST_AUTO_TEST_CASE(test_cache) file.seekg(0, std::ios::beg); file.read(buffer, size); + BOOST_CHECK_EQUAL(file.gcount(), size); file.close(); + + //BOOST_CHECK_UNEQUAL(ret, -1); + + for(unsigned int i = 0; i < size; i++) { BOOST_CHECK_EQUAL(buffer[i], reference[i]); From b706b96cae064c07bc2a3e6ed71ecce04b548c1c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 8 Jan 2020 16:41:05 +0100 Subject: [PATCH 052/119] sav --- src/fuse.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/fuse.cpp b/src/fuse.cpp index ee99f07a..32c6d272 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -416,10 +416,14 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) //fuse option variable to send to fuse argv_fuse[fi->argc_fuse++] = (char *) "fastafs"; // becomes fuse.fastafs + printf("checkpoint a\n"); + std::vector fuse_options = {}; // those that need to be appended later char current_argument = '\0';// could be o for '-o', etc. + + std::vector full_args = {}; for(signed int i = 0; i < argc; ++i) { printf("processing argv[%i] = '%s' [current argument=%i]\n", i, argv[i], (int) current_argument); @@ -468,8 +472,14 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } - if(full_args.size() >= 2) { + printf("checkpoint b\n"); + + + if(full_args.size() > 2) { + printf("checkpoint c\n"); + printf("full_args.size() = %i\n", full_args.size()); int mount_target_arg = full_args[full_args.size() - 2 ]; // last two arguments are and , location to last 2 args not starting with --/- are in this vector + printf("out of bound???\n"); if(fi->from_fastafs) { database d = database(); @@ -507,6 +517,8 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } + printf("checkpoint c\n"); + return fi; } @@ -515,10 +527,14 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) void fuse(int argc, char *argv[]) { + printf("wake up\n"); + // part 1 - rewrite args because "fastafs" "mount" is considered as two args, crashing fuse_init // - @todo at some point define that second mount is not really important? if possible char *argv2[argc]; fuse_instance *ffi = parse_args(argc, argv, argv2); + + printf("checkpoint\n"); // part 2 - print what the planning is char cur_time[100]; From 18d2e3ac1e2bf16584b95d74f20ea7df439e4259 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Fri, 10 Jan 2020 11:13:49 +0100 Subject: [PATCH 053/119] sav --- src/lsfastafs.cpp | 3 +-- src/utils.cpp | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/lsfastafs.cpp b/src/lsfastafs.cpp index 2c3f54da..aae91711 100644 --- a/src/lsfastafs.cpp +++ b/src/lsfastafs.cpp @@ -47,7 +47,6 @@ std::unordered_multimap > get_f fprintf(stdout, "Could not open /proc/mounts - are you sure this is running on linux?\n"); } do { - match = fscanf(f, "%255s %255s %255s %255s %d %d\n", mount_dev, mount_dir, mount_type, mount_opts, &mount_freq, &mount_passno); mount_dev[255] = 0; mount_dir[255] = 0; @@ -65,7 +64,6 @@ std::unordered_multimap > get_f //std::cout << "basename: " << basename << "\n"; std::string dict_fn = std::string(mount_dir) + "/" + basename + ".dict"; - if(getxattr(mount_dir, FASTAFS_FILE_XATTR_NAME.c_str(), xattr_fastafs_file, 255) != -1 && getxattr(mount_dir, FASTAFS_PID_XATTR_NAME.c_str(), xattr_fastafs_pid, 255) != -1 @@ -77,6 +75,7 @@ std::unordered_multimap > get_f } } + // else: line did not contain fastafs mount point } while(match != EOF); fclose(f); diff --git a/src/utils.cpp b/src/utils.cpp index 2e8b916e..2d073b81 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -187,8 +187,8 @@ std::string basename_cpp(std::string fn) //std::cout << "basenamepp: |" << filenamepp << "|\n"; std::string filename_cpp = std::string(filename); - delete[] ts; - delete[] filename; + //delete[] ts; + //delete[] filename; // deleting these affects the std::string somehow return filename_cpp; } From f23dc292c1ccbd4de6e5b7774fdc040194112fb6 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Fri, 10 Jan 2020 14:47:39 +0100 Subject: [PATCH 054/119] revert back to c++14 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29d60522..009e8663 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,7 +75,7 @@ add_custom_target(tidy DEPENDS make_tidy ) add_subdirectory(src) include_directories(include) -add_definitions(-std=c++17) +add_definitions(-std=c++14) # Boost find_package(Boost COMPONENTS unit_test_framework REQUIRED) From 22c43236fd1aacaccac2fd5e5f1aff6b68be88b4 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 14 Jan 2020 09:44:37 +0100 Subject: [PATCH 055/119] meson --- .gitignore | 2 ++ build-release-meson.sh | 5 +++++ meson.build | 14 ++++++++++++++ 3 files changed, 21 insertions(+) create mode 100755 build-release-meson.sh create mode 100644 meson.build diff --git a/.gitignore b/.gitignore index 8dc6ef5f..2a0f03dd 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,5 @@ build/ xcheck.sh *.fa.fai *.o +/bin-meson +/build-meson diff --git a/build-release-meson.sh b/build-release-meson.sh new file mode 100755 index 00000000..b76c7eb2 --- /dev/null +++ b/build-release-meson.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +meson bin-meson +cd bin-meson +ninja diff --git a/meson.build b/meson.build new file mode 100644 index 00000000..2c2f86c3 --- /dev/null +++ b/meson.build @@ -0,0 +1,14 @@ +project('fastafs', 'cpp') + +src = ['./src/fasta_to_fourbit_fastafs.cpp', './src/fasta_to_twobit_fastafs.cpp', './src/flags.cpp', './src/fourbit_byte.cpp', './src/twobit_byte.cpp', './src/ucsc2bit.cpp', './src/ucsc2bit_to_fastafs.cpp', './src/fastafs.cpp', './src/fuse.cpp', './src/utils.cpp', './src/database.cpp', './src/lsfastafs.cpp', './src/main.cpp'] + +incdir = include_directories('include') + +fuse = dependency('fuse') +crypto = dependency('libcrypto') +openssl = dependency('openssl') + +executable('fastafs', src, + include_directories : incdir, + dependencies: [crypto, openssl, fuse]) + From aa3907e42b4c3b63acdebd05b944eff7782780ec Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 14 Jan 2020 09:46:50 +0100 Subject: [PATCH 056/119] sav --- meson.build | 3 +++ 1 file changed, 3 insertions(+) diff --git a/meson.build b/meson.build index 2c2f86c3..d1eb23bf 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,8 @@ project('fastafs', 'cpp') +add_global_arguments('-O3', language : 'cpp') +add_global_arguments('-D_FILE_OFFSET_BITS=64', language : 'cpp') + src = ['./src/fasta_to_fourbit_fastafs.cpp', './src/fasta_to_twobit_fastafs.cpp', './src/flags.cpp', './src/fourbit_byte.cpp', './src/twobit_byte.cpp', './src/ucsc2bit.cpp', './src/ucsc2bit_to_fastafs.cpp', './src/fastafs.cpp', './src/fuse.cpp', './src/utils.cpp', './src/database.cpp', './src/lsfastafs.cpp', './src/main.cpp'] incdir = include_directories('include') From a4349353a52b830b51d6bc87fee71bd5666c5684 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 14 Jan 2020 09:51:14 +0100 Subject: [PATCH 057/119] sav --- meson.build | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index d1eb23bf..0f6c43b9 100644 --- a/meson.build +++ b/meson.build @@ -1,4 +1,6 @@ -project('fastafs', 'cpp') +project('fastafs', 'cpp', + version : '1.7.0', default_options : ['warning_level=3', 'cpp_std=c++14']) + add_global_arguments('-O3', language : 'cpp') add_global_arguments('-D_FILE_OFFSET_BITS=64', language : 'cpp') From a36ddc4b4b10e98c8abb18ec15d9a686b3c16344 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 14 Jan 2020 10:15:33 +0100 Subject: [PATCH 058/119] meson --- meson.build | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/meson.build b/meson.build index 0f6c43b9..905aa7b6 100644 --- a/meson.build +++ b/meson.build @@ -1,12 +1,30 @@ project('fastafs', 'cpp', version : '1.7.0', default_options : ['warning_level=3', 'cpp_std=c++14']) - add_global_arguments('-O3', language : 'cpp') add_global_arguments('-D_FILE_OFFSET_BITS=64', language : 'cpp') + +# make config: +# prefix = get_option('prefix') +# https://mesonbuild.com/Configuration.html#a-full-example +conf_data = configuration_data() +conf_data.set('PACKAGE', '"fastafs"') +conf_data.set('DEBUG', 'false') +configure_file(input : 'include/config.hpp.in', + output : 'config.hpp', + configuration : conf_data) +#configuration_inc = include_directories('include') + + + + + + + src = ['./src/fasta_to_fourbit_fastafs.cpp', './src/fasta_to_twobit_fastafs.cpp', './src/flags.cpp', './src/fourbit_byte.cpp', './src/twobit_byte.cpp', './src/ucsc2bit.cpp', './src/ucsc2bit_to_fastafs.cpp', './src/fastafs.cpp', './src/fuse.cpp', './src/utils.cpp', './src/database.cpp', './src/lsfastafs.cpp', './src/main.cpp'] + incdir = include_directories('include') fuse = dependency('fuse') From 23ca9d1a1f5971c601c6b08387f786c2ac35dc0d Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 14 Jan 2020 21:22:26 +0100 Subject: [PATCH 059/119] smallmesonchange --- meson.build | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/meson.build b/meson.build index 905aa7b6..6613a894 100644 --- a/meson.build +++ b/meson.build @@ -1,5 +1,8 @@ project('fastafs', 'cpp', - version : '1.7.0', default_options : ['warning_level=3', 'cpp_std=c++14']) + version : run_command('bash', '-c' , 'grep PROJECT_VERSION CMakeLists.txt | grep -Po \'".+"\' | grep -Po \'[^"]+\'').stdout().strip(), default_options : ['warning_level=3', 'cpp_std=c++14']) + +project_version = run_command('bash', '-c' , 'grep PROJECT_VERSION CMakeLists.txt | grep -Po \'".+"\' | grep -Po \'[^"]+\'').stdout().strip() + add_global_arguments('-O3', language : 'cpp') add_global_arguments('-D_FILE_OFFSET_BITS=64', language : 'cpp') @@ -9,7 +12,8 @@ add_global_arguments('-D_FILE_OFFSET_BITS=64', language : 'cpp') # prefix = get_option('prefix') # https://mesonbuild.com/Configuration.html#a-full-example conf_data = configuration_data() -conf_data.set('PACKAGE', '"fastafs"') +conf_data.set('PROJECT_VERSION', project_version) +conf_data.set('CMAKE_PROJECT_NAME', 'fastafs') conf_data.set('DEBUG', 'false') configure_file(input : 'include/config.hpp.in', output : 'config.hpp', From fa0cc21294e731184712b2b5ad04fc2069fe1846 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 13:44:20 +0100 Subject: [PATCH 060/119] sav --- doc/FASTAFS-FORMAT-SPECIFICATION.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/FASTAFS-FORMAT-SPECIFICATION.md b/doc/FASTAFS-FORMAT-SPECIFICATION.md index bfdf4810..cf1ce0d8 100644 --- a/doc/FASTAFS-FORMAT-SPECIFICATION.md +++ b/doc/FASTAFS-FORMAT-SPECIFICATION.md @@ -50,7 +50,7 @@ If this metadata would be written in the header located before the sequence data | | METADATA-TYPE-FLAG | 2 bytes | | | ENTRY | type specific, examples below: | | | => ORIGINAL PADDING | uint32_t as [4-byte integer](#four-byte-integer) | The number of nucleotides per line in the original FASTA file | - +| CRC32 | Checksum on entire file | 4 bytes | To ensure whole file integrity | ### GENERIC-HEADER ### From 957e6c139557015970867cfac1196211c75db7f9 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 14:50:36 +0100 Subject: [PATCH 061/119] adds crc32 --- CMakeLists.txt | 2 ++ src/fasta_to_twobit_fastafs.cpp | 34 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 009e8663..301b1294 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,9 +87,11 @@ else() include_directories(${Boost_INCLUDE_DIRS}) endif() + link_libraries(ssl) link_libraries(crypto) link_libraries(fuse) +link_libraries(z)# zlib; -lz; for crc32 checks on whole file integrity if(DEBUG) diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 9f428fb2..cb0b6e26 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -1,5 +1,6 @@ #include #include +#include "zlib.h" #include "config.hpp" @@ -319,10 +320,41 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f // calc written size fh_fastafs.seekp(0, std::ios::end); - size_t written = fh_fastafs.tellp(); fh_fasta.close(); + + // + // now calculate crc32 checksum, as all bits have been set. + std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + uLong crc = crc32(0L, Z_NULL, 0); + + bool terminate = false; + bool togo = true; + while(togo) + { + if(!fh_fastafs_crc.read(buffer, 4)) { + terminate = true; + } + //printf("alive [%i]\n", fh_fastafs_crc.gcount()); + printf("--\n"); + crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); + + if(terminate) { + togo = false; + } + }; + // -- + + //write crc + char byte_enc[5]; + uint_to_fourbytes(byte_enc, (uint32_t) crc); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + size_t written = fh_fastafs.tellp(); fh_fastafs.close(); + return written; } From 49110232f2681865b7592236f7c687ace2fd1d2c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 15:55:13 +0100 Subject: [PATCH 062/119] crc32 --- include/fastafs.hpp | 4 +- src/fasta_to_twobit_fastafs.cpp | 2 +- src/fastafs.cpp | 100 ++++++++++++++++++++++++++++---- src/main.cpp | 7 ++- test/fastafs/test_fastafs.cpp | 2 +- 5 files changed, 100 insertions(+), 15 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 51ac1515..10a166b0 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -123,6 +123,7 @@ class fastafs uint32_t view_ucsc2bit_chunk(char *, size_t, off_t); size_t view_dict_chunk(char *, size_t, off_t); + size_t fastafs_filesize(void); size_t fasta_filesize(uint32_t); size_t ucsc2bit_filesize(void); size_t dict_filesize(void); @@ -130,7 +131,8 @@ class fastafs std::string get_faidx(uint32_t);//@todo get rid of this, make it full chunked int info(bool); - int check_integrity(void); + bool check_file_integrity(void); + bool check_sequence_integrity(void); }; diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index cb0b6e26..4efbf673 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -338,7 +338,6 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f terminate = true; } //printf("alive [%i]\n", fh_fastafs_crc.gcount()); - printf("--\n"); crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); if(terminate) { @@ -352,6 +351,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f uint_to_fourbytes(byte_enc, (uint32_t) crc); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + // finalize file size_t written = fh_fastafs.tellp(); fh_fastafs.close(); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 4cf2632d..d0f4fba9 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -25,6 +25,7 @@ #include #include #include +#include // crc32 #include "config.hpp" @@ -1146,6 +1147,40 @@ size_t fastafs::view_dict_chunk(char *buffer, size_t buffer_size, off_t file_off +//@todo add unit tests +size_t fastafs::fastafs_filesize(void) +{ + // header + n-sequences + size_t n = 4 + 4 + 2 + 4; + + // number sequences + n += 4; + + // per sequence + for(uint32_t i = 0; i < this->data.size(); i++) { + n += 2;// flags + n += 1; // name length + n += this->data[i]->name.size();// name + n += 4; // reference to compr. data + + // compr dataa + n += 4 + 4 + 4;// compressed nuc. + n blocks + m blocks + n += this->data[i]->n_twobits(); + n += this->data[i]->n_starts.size() * 8; + n += 16;//md5 sum, always present? + n += this->data[i]->m_starts.size() * 8; + } + + // metadata + n += 1; // @ todo more sophi. + + // crc32 + n += 4; + + return n; +} + + size_t fastafs::fasta_filesize(uint32_t padding) { size_t n = 0; @@ -1393,14 +1428,65 @@ int fastafs::info(bool ena_verify_checksum) return 0; } +//true = integer +//false = corrupt +bool fastafs::check_file_integrity() +{ + if(this->filename.size() == 0) { + throw std::invalid_argument("No filename found"); + } + + // starts at 4th + uint32_t n_bytes = this->fastafs_filesize(); + printf("n bytes: %i == 403??\n", n_bytes); + n_bytes -= 4; // position where crc32 should start - may actually be absent if conversion crashed(!) + + + + + bool retcode = true; + char buffer[5]; + + + // now calculate crc32 checksum, as all bits have been set. + std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + uLong crc = crc32(0L, Z_NULL, 0); + + bool terminate = false; + bool togo = true; + while(togo) + { + if(!fh_fastafs_crc.read(buffer, 4)) { + terminate = true; + } + //printf("alive [%i]\n", fh_fastafs_crc.gcount()); + //printf("--\n"); + crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); + + if(terminate) { + togo = false; + } + }; + + char byte_enc[5]; + uint_to_fourbytes(byte_enc, (uint32_t) crc); + + return retcode; +} + -int fastafs::check_integrity() +//true = integer +//false = corrupt +bool fastafs::check_sequence_integrity() { if(this->filename.size() == 0) { throw std::invalid_argument("No filename found"); } - int retcode = 0; + bool retcode = true; + char md5_hash[32 + 1] = ""; md5_hash[32] = '\0'; std::string old_hash; @@ -1413,20 +1499,12 @@ int fastafs::check_integrity() md5_digest_to_hash(this->data[i]->md5_digest, md5_hash); old_hash = std::string(md5_hash); - /* - * const uint32_t padding;// padding used for this sequence, cannot be 0 - const uint32_t total_sequence_containing_lines;// calculate total number of full nucleotide lines: (this->n + padding - 1) / padding - - std::vector n_starts; - std::vector n_ends; - * */ - std::string new_hash = this->data[i]->md5(cache->sequences[i], &file); if(old_hash.compare(new_hash) == 0) { printf("OK\t%s\n", this->data[i]->name.c_str()); } else { printf("ERROR\t%s\t%s != %s\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); - retcode = EIO; + retcode = false; } } file.close(); diff --git a/src/main.cpp b/src/main.cpp index 8e17c870..82356efd 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -253,7 +253,12 @@ int main(int argc, char *argv[]) fastafs f = fastafs(std::string(argv[argc - 1])); f.load(fname); - return f.check_integrity(); + if(f.check_file_integrity() and f.check_sequence_integrity()) { + return 0; + } + else { + return EIO; + } } else { usage_check(); } diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 364dbd6e..dbe06fd3 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); - BOOST_CHECK_EQUAL(fs.check_integrity(), 0); + BOOST_CHECK_EQUAL(fs.check_sequence_integrity(), 0); } From db01dcdc3a824533d376c33212b9b4bfaff19034 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 16:08:15 +0100 Subject: [PATCH 063/119] asd --- src/fastafs.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index d0f4fba9..cb1984e8 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1437,22 +1437,26 @@ bool fastafs::check_file_integrity() } // starts at 4th - uint32_t n_bytes = this->fastafs_filesize(); - printf("n bytes: %i == 403??\n", n_bytes); - n_bytes -= 4; // position where crc32 should start - may actually be absent if conversion crashed(!) - - - + uint32_t bytes_to_read = this->fastafs_filesize() -4 - 4 ; + + uLong crc = crc32(0L, Z_NULL, 0); - bool retcode = true; char buffer[5]; // now calculate crc32 checksum, as all bits have been set. std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway - - uLong crc = crc32(0L, Z_NULL, 0); + + while(bytes_to_read > 0) { + printf("still to read: %i\n", bytes_to_read); + + + bytes_to_read -= std::min( (uint32_t) 4, bytes_to_read); + printf(" - now reading: 4 \n", bytes_to_read); + printf(" - still remaining: %i\n", bytes_to_read); + } + bool terminate = false; bool togo = true; @@ -1470,10 +1474,11 @@ bool fastafs::check_file_integrity() } }; - char byte_enc[5]; + char byte_enc[5] = "\x00\x01\x02\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); + printf("[%i][%i][%i][%i]\n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); - return retcode; + return true; } From 8c89e256177f6358591dbc73886325d04946e4dd Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 16:35:22 +0100 Subject: [PATCH 064/119] equal crc32 --- src/fasta_to_twobit_fastafs.cpp | 11 ++++++++-- src/fastafs.cpp | 37 ++++++++++++++++----------------- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 4efbf673..cb92f283 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -328,6 +328,9 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + uint32_t nnn = 0; + uint32_t iii; + uLong crc = crc32(0L, Z_NULL, 0); bool terminate = false; @@ -338,17 +341,21 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f terminate = true; } //printf("alive [%i]\n", fh_fastafs_crc.gcount()); - crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); + iii = fh_fastafs_crc.gcount(); + crc = crc32(crc, (const Bytef*)& buffer, iii); + nnn += iii; if(terminate) { togo = false; } }; // -- + printf("nnn = %i\n",nnn); //write crc - char byte_enc[5]; + char byte_enc[5] = "\x00\x00\x00\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); + printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); // finalize file diff --git a/src/fastafs.cpp b/src/fastafs.cpp index cb1984e8..b9df63f3 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1442,7 +1442,8 @@ bool fastafs::check_file_integrity() uLong crc = crc32(0L, Z_NULL, 0); char buffer[5]; - + uint32_t nnn = 0; + uint32_t iii; // now calculate crc32 checksum, as all bits have been set. std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); @@ -1452,27 +1453,25 @@ bool fastafs::check_file_integrity() printf("still to read: %i\n", bytes_to_read); - bytes_to_read -= std::min( (uint32_t) 4, bytes_to_read); - printf(" - now reading: 4 \n", bytes_to_read); - printf(" - still remaining: %i\n", bytes_to_read); - } - - - bool terminate = false; - bool togo = true; - while(togo) - { - if(!fh_fastafs_crc.read(buffer, 4)) { - terminate = true; - } - //printf("alive [%i]\n", fh_fastafs_crc.gcount()); - //printf("--\n"); + iii = std::min( (uint32_t) 4, bytes_to_read) ; + fh_fastafs_crc.read(buffer, iii); + bytes_to_read -= iii; + printf(" - now reading: %i \n", iii ); crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); - if(terminate) { - togo = false; + /* + if(fh_fastafs_crc.read(buffer, std::min( (uint32_t) 4, bytes_to_read))) { + } - }; + else { + bytes_to_read = 0;// unexpected EOF + } + */ + //bytes_to_read -= ; + printf(" - still remaining: %i\n", bytes_to_read); + } + + char byte_enc[5] = "\x00\x01\x02\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); From 50269428aa0afe318cec0472c5f7ffe598278e71 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 16:44:51 +0100 Subject: [PATCH 065/119] sav --- Changelog | 6 ++++++ meson.build | 3 ++- src/fastafs.cpp | 24 ++++++------------------ 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/Changelog b/Changelog index 7d6166b6..eb55a392 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,9 @@ +2012-01-15 Youri Hoogstrate + + * v1.7.0 + * CRC32 checksums for file integratity + * converting to meson because of insane build times using cmake+make and re-building files that have not changed + 2019-09-06 Youri Hoogstrate * v1.6.2 diff --git a/meson.build b/meson.build index 6613a894..649fb18b 100644 --- a/meson.build +++ b/meson.build @@ -34,8 +34,9 @@ incdir = include_directories('include') fuse = dependency('fuse') crypto = dependency('libcrypto') openssl = dependency('openssl') +zlib = dependency('zlib') executable('fastafs', src, include_directories : incdir, - dependencies: [crypto, openssl, fuse]) + dependencies: [crypto, openssl, fuse, zlib]) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index b9df63f3..e4507f9d 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1441,8 +1441,8 @@ bool fastafs::check_file_integrity() uLong crc = crc32(0L, Z_NULL, 0); - char buffer[5]; - uint32_t nnn = 0; + const int buffer_size = 4; + char buffer[buffer_size + 1]; uint32_t iii; // now calculate crc32 checksum, as all bits have been set. @@ -1450,25 +1450,13 @@ bool fastafs::check_file_integrity() fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway while(bytes_to_read > 0) { - printf("still to read: %i\n", bytes_to_read); - - - iii = std::min( (uint32_t) 4, bytes_to_read) ; + //printf("still to read: %i\n", bytes_to_read); + + iii = std::min( (uint32_t) buffer_size, bytes_to_read) ; fh_fastafs_crc.read(buffer, iii); bytes_to_read -= iii; - printf(" - now reading: %i \n", iii ); + crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); - - /* - if(fh_fastafs_crc.read(buffer, std::min( (uint32_t) 4, bytes_to_read))) { - - } - else { - bytes_to_read = 0;// unexpected EOF - } - */ - //bytes_to_read -= ; - printf(" - still remaining: %i\n", bytes_to_read); } From 139d53f8e9d7d3306c4ac933b30b82f0f15820f7 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 16:47:47 +0100 Subject: [PATCH 066/119] sav --- src/fastafs.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index e4507f9d..202e2f93 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1437,30 +1437,30 @@ bool fastafs::check_file_integrity() } // starts at 4th - uint32_t bytes_to_read = this->fastafs_filesize() -4 - 4 ; + uint32_t total_bytes_to_be_read = this->fastafs_filesize() -4 - 4 ; uLong crc = crc32(0L, Z_NULL, 0); const int buffer_size = 4; char buffer[buffer_size + 1]; - uint32_t iii; + + uint32_t bytes_to_be_read_this_iter; + uint32_t bytes_actually_read_this_iter; // now calculate crc32 checksum, as all bits have been set. std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway - while(bytes_to_read > 0) { - //printf("still to read: %i\n", bytes_to_read); - - iii = std::min( (uint32_t) buffer_size, bytes_to_read) ; - fh_fastafs_crc.read(buffer, iii); - bytes_to_read -= iii; + while(total_bytes_to_be_read > 0) { + bytes_to_be_read_this_iter = std::min( (uint32_t) buffer_size, total_bytes_to_be_read) ; + fh_fastafs_crc.read(buffer, bytes_to_be_read_this_iter); + total_bytes_to_be_read -= bytes_to_be_read_this_iter; - crc = crc32(crc, (const Bytef*)& buffer, fh_fastafs_crc.gcount()); + bytes_actually_read_this_iter = fh_fastafs_crc.gcount(); + crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); } - char byte_enc[5] = "\x00\x01\x02\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); printf("[%i][%i][%i][%i]\n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); From 1269e3e6d48dfdbf9dbe273b77365daf83063feb Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 16:48:52 +0100 Subject: [PATCH 067/119] sav --- src/fastafs.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 202e2f93..a1607b37 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1457,7 +1457,12 @@ bool fastafs::check_file_integrity() total_bytes_to_be_read -= bytes_to_be_read_this_iter; bytes_actually_read_this_iter = fh_fastafs_crc.gcount(); - crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); + if(bytes_actually_read_this_iter == 0) { + total_bytes_to_be_read = 0; + } + else { + crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); + } } From 617b9c23adc5f8bfef27f14e94466341144d8c55 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 16:57:58 +0100 Subject: [PATCH 068/119] sav --- bin/.gitignore | 1 - include/fastafs.hpp | 2 ++ src/fastafs.cpp | 46 ++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 4 deletions(-) delete mode 100644 bin/.gitignore diff --git a/bin/.gitignore b/bin/.gitignore deleted file mode 100644 index 9f8bdef4..00000000 --- a/bin/.gitignore +++ /dev/null @@ -1 +0,0 @@ -fastafs diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 10a166b0..8a72242d 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -108,6 +108,7 @@ class fastafs std::string name; std::string filename; std::vector data; + uint32_t crc32f;// crc32 as found in fastafs file fastafs_flags flags; @@ -123,6 +124,7 @@ class fastafs uint32_t view_ucsc2bit_chunk(char *, size_t, off_t); size_t view_dict_chunk(char *, size_t, off_t); + uint32_t get_crc32(void);// returns a 'new' crc32, estimated on file contents size_t fastafs_filesize(void); size_t fasta_filesize(uint32_t); size_t ucsc2bit_filesize(void); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index a1607b37..0cf27713 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1428,6 +1428,47 @@ int fastafs::info(bool ena_verify_checksum) return 0; } + +// skips first four bytes and does not include crc32 at the end either +uint32_t fastafs::get_crc32(void) +{ + if(this->filename.size() == 0) { + throw std::invalid_argument("No filename found"); + } + + // starts at 4th + uint32_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; + + uLong crc = crc32(0L, Z_NULL, 0); + + const int buffer_size = 4; + char buffer[buffer_size + 1]; + + uint32_t bytes_to_be_read_this_iter; + uint32_t bytes_actually_read_this_iter; + + // now calculate crc32 checksum, as all bits have been set. + std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + while(total_bytes_to_be_read > 0) { + bytes_to_be_read_this_iter = std::min( (uint32_t) buffer_size, total_bytes_to_be_read) ; + fh_fastafs_crc.read(buffer, bytes_to_be_read_this_iter); + total_bytes_to_be_read -= bytes_to_be_read_this_iter; + + bytes_actually_read_this_iter = fh_fastafs_crc.gcount(); + if(bytes_actually_read_this_iter == 0) { + total_bytes_to_be_read = 0; // unexpected eof? + } + else { + crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); + } + } + + return crc; +} + + //true = integer //false = corrupt bool fastafs::check_file_integrity() @@ -1437,8 +1478,8 @@ bool fastafs::check_file_integrity() } // starts at 4th - uint32_t total_bytes_to_be_read = this->fastafs_filesize() -4 - 4 ; - + uint32_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; + uLong crc = crc32(0L, Z_NULL, 0); const int buffer_size = 4; @@ -1465,7 +1506,6 @@ bool fastafs::check_file_integrity() } } - char byte_enc[5] = "\x00\x01\x02\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); printf("[%i][%i][%i][%i]\n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); From 71e5ce2d34774dd834436ffcfe2966b9257d2940 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 17:04:55 +0100 Subject: [PATCH 069/119] av --- src/fasta_to_twobit_fastafs.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index cb92f283..a0c7de2b 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -4,6 +4,7 @@ #include "config.hpp" +#include "fastafs.hpp" #include "fasta_to_twobit_fastafs.hpp" #include "flags.hpp" #include "utils.hpp" @@ -325,6 +326,18 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f // // now calculate crc32 checksum, as all bits have been set. + + fastafs f(""); + f.load(fastafs_file); + uint32_t crc32c = f.get_crc32(); + + char byte_enc[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(byte_enc, (uint32_t) crc32c); + printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + + /* std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway @@ -353,15 +366,17 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f printf("nnn = %i\n",nnn); //write crc - char byte_enc[5] = "\x00\x00\x00\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); - + */ + // finalize file size_t written = fh_fastafs.tellp(); fh_fastafs.close(); + + return written; } From a7f6b53f9b969a26f71447da695ef3247f147b2f Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 17:05:57 +0100 Subject: [PATCH 070/119] sav --- src/fasta_to_twobit_fastafs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index a0c7de2b..6fb8e89a 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -333,7 +333,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f char byte_enc[5] = "\x00\x00\x00\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc32c); - printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + //printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); From c68827c9e1e60cd1a11e417f4d321a191f83f338 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 17:33:14 +0100 Subject: [PATCH 071/119] sav --- src/fastafs.cpp | 75 +++++++++++++++++++++++++++---------------------- src/main.cpp | 4 ++- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 0cf27713..93580329 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -508,7 +508,7 @@ bool fastafs_seq::get_n_offset(uint32_t pos, uint32_t *num_Ns) fastafs::fastafs(std::string arg_name) : - name(arg_name) + name(arg_name), crc32f(0) { } @@ -661,6 +661,18 @@ void fastafs::load(std::string afilename) this->data[i] = s; } + // metadata section - empty for now + file.read(memblock, 1); + + // crc32 checksum - may be missing because fastafs::load is also used before fastafs::get_crc32 is ran to obtain the checksum + file.read(memblock, 4); + if(file.gcount() == 4) { + this->crc32f = fourbytes_to_uint(memblock, 0); + } + else { + //printf("crc32 checksum missing\n"); + } + file.close(); delete[] memblock; } @@ -1473,44 +1485,41 @@ uint32_t fastafs::get_crc32(void) //false = corrupt bool fastafs::check_file_integrity() { - if(this->filename.size() == 0) { - throw std::invalid_argument("No filename found"); - } - - // starts at 4th - uint32_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; + uint32_t crc32_current = this->get_crc32(); - uLong crc = crc32(0L, Z_NULL, 0); + char buf_old[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(buf_old, (uint32_t) this->crc32f); - const int buffer_size = 4; - char buffer[buffer_size + 1]; + if(crc32_current != this->crc32f) { - uint32_t bytes_to_be_read_this_iter; - uint32_t bytes_actually_read_this_iter; + char buf_new[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(buf_new, (uint32_t) crc32_current); + + printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (in-file) != %02hhx%02hhx%02hhx%02hhx (actual file)\n--\n", + buf_old[0], + buf_old[1], + buf_old[2], + buf_old[3], - // now calculate crc32 checksum, as all bits have been set. - std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); - fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + buf_new[0], + buf_new[1], + buf_new[2], + buf_new[3] - while(total_bytes_to_be_read > 0) { - bytes_to_be_read_this_iter = std::min( (uint32_t) buffer_size, total_bytes_to_be_read) ; - fh_fastafs_crc.read(buffer, bytes_to_be_read_this_iter); - total_bytes_to_be_read -= bytes_to_be_read_this_iter; - - bytes_actually_read_this_iter = fh_fastafs_crc.gcount(); - if(bytes_actually_read_this_iter == 0) { - total_bytes_to_be_read = 0; - } - else { - crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); - } + ); + } - - char byte_enc[5] = "\x00\x01\x02\x00"; - uint_to_fourbytes(byte_enc, (uint32_t) crc); - printf("[%i][%i][%i][%i]\n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); - - return true; + else { + printf("OK\t%02hhx%02hhx%02hhx%02hhx\n--\n", + buf_old[0], + buf_old[1], + buf_old[2], + buf_old[3] + + ); + } + + return (crc32_current == this->crc32f); } diff --git a/src/main.cpp b/src/main.cpp index 82356efd..d79a759f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -253,7 +253,9 @@ int main(int argc, char *argv[]) fastafs f = fastafs(std::string(argv[argc - 1])); f.load(fname); - if(f.check_file_integrity() and f.check_sequence_integrity()) { + bool check1 = f.check_file_integrity(); + bool check2 = f.check_sequence_integrity(); + if(check1 and check2) { return 0; } else { From 7aee856cac9a30fc321ad6b91668049fd090939d Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 17:45:20 +0100 Subject: [PATCH 072/119] sav --- .gitignore | 2 ++ build-debug.sh | 12 +++++++++--- build-release.sh | 2 +- src/fastafs.cpp | 2 +- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 2a0f03dd..75594e1d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ xcheck.sh *.o /bin-meson /build-meson +*.ninja +.ninja* diff --git a/build-debug.sh b/build-debug.sh index 6eb62a47..6c89a14f 100755 --- a/build-debug.sh +++ b/build-debug.sh @@ -1,5 +1,11 @@ #!/bin/bash -cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -make "$@" -j `nproc` -make install +cmake -GNinja -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +ninja -j`nproc` +ninja install + + +# using make - much slower +#cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +#make "$@" -j `nproc` +#make install diff --git a/build-release.sh b/build-release.sh index 58278282..aa015011 100755 --- a/build-release.sh +++ b/build-release.sh @@ -1,5 +1,5 @@ #!/bin/bash -cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +cmake -GNinja -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . make "$@" -j `nproc` sudo make install diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 93580329..8f211678 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1477,7 +1477,7 @@ uint32_t fastafs::get_crc32(void) } } - return crc; + return (uint32_t) crc; } From 2422b94f724bab4d7ce2e3a0a64ab60948bd8e2a Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 18:22:35 +0100 Subject: [PATCH 073/119] sav --- src/fasta_to_fourbit_fastafs.cpp | 54 ++++++++++++++++++++++++++++++- src/fasta_to_twobit_fastafs.cpp | 5 +-- src/fastafs.cpp | 1 + test/cache/test_cache_fourbit.cpp | 15 +++++++-- test/cache/test_cache_twobit.cpp | 27 ++++++++++------ test/fastafs/test_fastafs.cpp | 2 +- 6 files changed, 87 insertions(+), 17 deletions(-) diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index f003dd46..bdc06885 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -1,6 +1,8 @@ #include #include +#include + #include "config.hpp" #include "fasta_to_fourbit_fastafs.hpp" @@ -564,11 +566,61 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); + fh_fasta.close(); + + + fh_fastafs.seekp(0, std::ios::end); + + printf("file size now: %i\n", fh_fastafs.tellp()); + + + + std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); + fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway + + uint32_t nnn = 0; + uint32_t iii; + + uLong crc = crc32(0L, Z_NULL, 0); + + bool terminate = false; + bool togo = true; + while(togo) + { + if(!fh_fastafs_crc.read(buffer, 4)) { + terminate = true; + } + //printf("alive [%i]\n", fh_fastafs_crc.gcount()); + iii = fh_fastafs_crc.gcount(); + crc = crc32(crc, (const Bytef*)& buffer, iii); + nnn += iii; + + if(terminate) { + togo = false; + } + }; + // -- + printf("nnn = %i\n",nnn); + + + + //write crc as 4 bytes + char byte_enc[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(byte_enc, (uint32_t) crc); + printf("[%i][%i][%i][%i] ~ %02hhx%02hhx%02hhx%02hhx \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3], + byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + + + // calc written size fh_fastafs.seekp(0, std::ios::end); size_t written = fh_fastafs.tellp(); - fh_fasta.close(); + + printf("file size now: %i\n", fh_fastafs.tellp()); + fh_fastafs.close(); return written; diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 6fb8e89a..ab7c0370 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -319,13 +319,10 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f uint_to_fourbytes(buffer, index_file_position);//position of header fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); - // calc written size - fh_fastafs.seekp(0, std::ios::end); - fh_fasta.close(); - // // now calculate crc32 checksum, as all bits have been set. + fh_fastafs.seekp(0, std::ios::end); fastafs f(""); f.load(fastafs_file); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 8f211678..1e0bc993 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -447,6 +447,7 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) +// @todo make this->n_bits uint32_t fastafs_seq::n_twobits() { // if n actg bits is: diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index fee801f4..a2e028f9 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -5,6 +5,7 @@ #include "config.hpp" #include "fasta_to_fourbit_fastafs.hpp" +//#include "fastafs.hpp" @@ -173,10 +174,13 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00\x00\x00\x0E"s // [, ] data position in file (14) // METADATA - "\x00" // [120] no metadata fields [padding will come soon?] + "\x00"s // [120] no metadata fields [padding will come soon?] + + // CRC32 + "\x41\x2f\x3c\x72"s ; - BOOST_CHECK_EQUAL(written, 121); + BOOST_CHECK_EQUAL(written, 125); //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); std::ifstream file("tmp/test_004.fastafs", std::ios::in | std::ios::binary | std::ios::ate); @@ -206,6 +210,13 @@ BOOST_AUTO_TEST_CASE(test_cache) } delete[] buffer; + + + + // check fastafs filesize + //fastafs f = fastafs(""); + //f.load("tmp/test_004.fastafs"); + //BOOST_CHECK_EQUAL(f.fastafs_filesize() , 125); } diff --git a/test/cache/test_cache_twobit.cpp b/test/cache/test_cache_twobit.cpp index 073f09a6..b0c33d0a 100644 --- a/test/cache/test_cache_twobit.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -4,8 +4,8 @@ #include "config.hpp" -//#include "twobit_byte.hpp" #include "fasta_to_twobit_fastafs.hpp" +//#include "fastafs.hpp" @@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(Test_size) */ BOOST_AUTO_TEST_CASE(test_cache) { - size_t written = fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); + size_t written = fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cache_test.fastafs"); static std::string reference = // GENERIC-HEADER @@ -315,13 +315,16 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00\x00\x01\x0A"s // [, ] data position in file (290) // METADATA - "\x00" // [399] no metadata fields [padding will come soon?] + "\x00"s // [399] no metadata fields [padding will come soon?] + + // CRC32 checksums + "\x1e\x77\x77\x22"s ; - BOOST_CHECK_EQUAL(written, 399); + BOOST_CHECK_EQUAL(written, 403); //BOOST_CHECK(output.compare(uppercase) == 0 or output.compare(mixedcase) == 0); - std::ifstream file("tmp/test_cachce_test.fastafs", std::ios::in | std::ios::binary | std::ios::ate); + std::ifstream file("tmp/test_cache_test.fastafs", std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); std::streampos size; @@ -345,6 +348,12 @@ BOOST_AUTO_TEST_CASE(test_cache) } delete[] buffer; + + + // check computed file size + fastafs f = fastafs(""); + f.load("tmp/test_cache_test.fastafs"); + BOOST_CHECK_EQUAL( f.fastafs_filesize() , 403 ); } @@ -358,11 +367,11 @@ BOOST_AUTO_TEST_CASE(test_cache) BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) { // generate FASTAFS file from FASTA file - fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cachce_test.fastafs"); + fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cache_test.fastafs"); // load the FASTAFS file fastafs f2 = fastafs("test"); - f2.load("tmp/test_cachce_test.fastafs"); + f2.load("tmp/test_cache_test.fastafs"); const uint32_t padding = 60; ffs2f_init* cache_p60_uc = f2.init_ffs2f(padding, false); // upper case @@ -418,11 +427,11 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) BOOST_AUTO_TEST_CASE(test_cache_with_newlines) { // generate FASTAFS file from FASTA file - fasta_to_twobit_fastafs("test/data/test_003.fa", "tmp/test_cachce_test_003.fastafs"); + fasta_to_twobit_fastafs("test/data/test_003.fa", "tmp/test_cache_test_003.fastafs"); // load the FASTAFS file fastafs f2 = fastafs("test"); - f2.load("tmp/test_cachce_test_003.fastafs"); + f2.load("tmp/test_cache_test_003.fastafs"); const uint32_t padding = 60; ffs2f_init* cache_p60 = f2.init_ffs2f(padding, false); diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index dbe06fd3..5560b98d 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); - BOOST_CHECK_EQUAL(fs.check_sequence_integrity(), 0); + BOOST_CHECK_EQUAL(fs.check_sequence_integrity(), true); } From 900d64ef0bd891f0df50f3243e9e008f55254666 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 18:40:03 +0100 Subject: [PATCH 074/119] sav --- CMakeLists.txt | 3 ++- README.md | 2 +- build-debug.sh | 14 +++++++------- src/ucsc2bit_to_fastafs.cpp | 17 +++++++++++++++-- test/CMakeLists.txt | 2 +- test/cache/test_cache_fourbit.cpp | 6 +++--- .../test_ucsc2bit_to_fastafs.cpp | 2 +- 7 files changed, 30 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 301b1294..2dc7bd52 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,7 +172,8 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as al add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N) add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit") -add_test(test_view "${BUILD_TEST_DIR}/test_view") +add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_twobit") +#add_test(test_view "${BUILD_TEST_DIR}/test_view") add_test(test_flags "${BUILD_TEST_DIR}/test_flags") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") diff --git a/README.md b/README.md index 5cff4e4b..77466b2f 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Required dependencies are: - libboost (only for unit testing, will be come an optional dependency soon) - libopenssl (for generating MD5 hashes) - libfuse (for access to the fuse layer system and file virtualization) - - c++ compiler supporting c++-17 + - c++ compiler supporting c++-14 Compilation is done using cmake. The build command to run cmake for common use is: diff --git a/build-debug.sh b/build-debug.sh index 6c89a14f..9f4ab973 100755 --- a/build-debug.sh +++ b/build-debug.sh @@ -1,11 +1,11 @@ #!/bin/bash -cmake -GNinja -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -ninja -j`nproc` -ninja install +#cmake -GNinja -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +#ninja -j`nproc` +#ninja install -# using make - much slower -#cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -#make "$@" -j `nproc` -#make install +## using make - sometimes much slower +cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +make "$@" -j `nproc` +make install diff --git a/src/ucsc2bit_to_fastafs.cpp b/src/ucsc2bit_to_fastafs.cpp index 7fc95dda..d15033b4 100644 --- a/src/ucsc2bit_to_fastafs.cpp +++ b/src/ucsc2bit_to_fastafs.cpp @@ -263,11 +263,24 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) fh_fastafs.write(reinterpret_cast(&buffer), (size_t) 4); } + fh_ucsc2bit.close(); + fh_fastafs.seekp(0, std::ios::end); - size_t written = fh_fastafs.tellp(); + + fastafs f(""); + f.load(fastafs_file); + uint32_t crc32c = f.get_crc32(); + + char byte_enc[5] = "\x00\x00\x00\x00"; + uint_to_fourbytes(byte_enc, (uint32_t) crc32c); + //printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); + + + + size_t written = fh_fastafs.tellp(); fh_fastafs.close(); - fh_ucsc2bit.close(); return written; } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 04e31b89..a6c92bdd 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -12,7 +12,7 @@ else() endif() -add_definitions(-std=c++17) +add_definitions(-std=c++14) set(BUILD_DIR "../bin") set(BUILD_TEST_DIR "${BUILD_DIR}/test") diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index a2e028f9..14277b50 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -214,9 +214,9 @@ BOOST_AUTO_TEST_CASE(test_cache) // check fastafs filesize - //fastafs f = fastafs(""); - //f.load("tmp/test_004.fastafs"); - //BOOST_CHECK_EQUAL(f.fastafs_filesize() , 125); + fastafs f = fastafs(""); + f.load("tmp/test_004.fastafs"); + BOOST_CHECK_EQUAL(f.fastafs_filesize() , 125); } diff --git a/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp b/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp index 191b4c0f..e29175b9 100644 --- a/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp +++ b/test/ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp @@ -52,7 +52,7 @@ BOOST_AUTO_TEST_CASE(test_ucsc2bit_to_fasta) std::istream_iterator b2(ifs2), e2; BOOST_CHECK_EQUAL_COLLECTIONS(b1, e1, b2, e2); - BOOST_CHECK_EQUAL(written, (size_t) 399); + BOOST_CHECK_EQUAL(written, (size_t) 403); } From 796073206eb60d9c5fe14e7cb58f02142001ca83 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 15 Jan 2020 18:40:44 +0100 Subject: [PATCH 075/119] sav --- CMakeLists.txt | 2 +- src/fasta_to_fourbit_fastafs.cpp | 15 ++++---- src/fasta_to_twobit_fastafs.cpp | 10 +++--- src/fastafs.cpp | 59 +++++++++++++++---------------- src/fuse.cpp | 12 +++---- src/main.cpp | 3 +- src/ucsc2bit_to_fastafs.cpp | 2 +- test/cache/test_cache_fourbit.cpp | 8 ++--- test/cache/test_cache_twobit.cpp | 8 ++--- 9 files changed, 57 insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dc7bd52..af7c0034 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -172,7 +172,7 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as al add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N) add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit") -add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_twobit") +add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_fourbit") #add_test(test_view "${BUILD_TEST_DIR}/test_view") add_test(test_flags "${BUILD_TEST_DIR}/test_flags") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index bdc06885..c5d4e7f6 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -577,16 +577,15 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway - + uint32_t nnn = 0; uint32_t iii; - + uLong crc = crc32(0L, Z_NULL, 0); - + bool terminate = false; bool togo = true; - while(togo) - { + while(togo) { if(!fh_fastafs_crc.read(buffer, 4)) { terminate = true; } @@ -594,13 +593,13 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string iii = fh_fastafs_crc.gcount(); crc = crc32(crc, (const Bytef*)& buffer, iii); nnn += iii; - + if(terminate) { togo = false; } }; // -- - printf("nnn = %i\n",nnn); + printf("nnn = %i\n", nnn); @@ -608,7 +607,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string char byte_enc[5] = "\x00\x00\x00\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); printf("[%i][%i][%i][%i] ~ %02hhx%02hhx%02hhx%02hhx \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3], - byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index ab7c0370..b6cf5868 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -333,16 +333,16 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f //printf("[%i][%i][%i][%i] input!! \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); - + /* std::ifstream fh_fastafs_crc(fastafs_file.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway - + uint32_t nnn = 0; uint32_t iii; - + uLong crc = crc32(0L, Z_NULL, 0); - + bool terminate = false; bool togo = true; while(togo) @@ -354,7 +354,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f iii = fh_fastafs_crc.gcount(); crc = crc32(crc, (const Bytef*)& buffer, iii); nnn += iii; - + if(terminate) { togo = false; } diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 1e0bc993..fa6291ea 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -669,8 +669,7 @@ void fastafs::load(std::string afilename) file.read(memblock, 4); if(file.gcount() == 4) { this->crc32f = fourbytes_to_uint(memblock, 0); - } - else { + } else { //printf("crc32 checksum missing\n"); } @@ -1183,12 +1182,12 @@ size_t fastafs::fastafs_filesize(void) n += 16;//md5 sum, always present? n += this->data[i]->m_starts.size() * 8; } - + // metadata n += 1; // @ todo more sophi. // crc32 - n += 4; + n += 4; return n; } @@ -1448,8 +1447,8 @@ uint32_t fastafs::get_crc32(void) if(this->filename.size() == 0) { throw std::invalid_argument("No filename found"); } - - // starts at 4th + + // starts at 4th uint32_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; uLong crc = crc32(0L, Z_NULL, 0); @@ -1465,15 +1464,14 @@ uint32_t fastafs::get_crc32(void) fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway while(total_bytes_to_be_read > 0) { - bytes_to_be_read_this_iter = std::min( (uint32_t) buffer_size, total_bytes_to_be_read) ; + bytes_to_be_read_this_iter = std::min((uint32_t) buffer_size, total_bytes_to_be_read) ; fh_fastafs_crc.read(buffer, bytes_to_be_read_this_iter); total_bytes_to_be_read -= bytes_to_be_read_this_iter; bytes_actually_read_this_iter = fh_fastafs_crc.gcount(); if(bytes_actually_read_this_iter == 0) { total_bytes_to_be_read = 0; // unexpected eof? - } - else { + } else { crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); } } @@ -1494,32 +1492,31 @@ bool fastafs::check_file_integrity() if(crc32_current != this->crc32f) { char buf_new[5] = "\x00\x00\x00\x00"; - uint_to_fourbytes(buf_new, (uint32_t) crc32_current); - + uint_to_fourbytes(buf_new, (uint32_t) crc32_current); + printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (in-file) != %02hhx%02hhx%02hhx%02hhx (actual file)\n--\n", - buf_old[0], - buf_old[1], - buf_old[2], - buf_old[3], - - buf_new[0], - buf_new[1], - buf_new[2], - buf_new[3] - - ); - - } - else { + buf_old[0], + buf_old[1], + buf_old[2], + buf_old[3], + + buf_new[0], + buf_new[1], + buf_new[2], + buf_new[3] + + ); + + } else { printf("OK\t%02hhx%02hhx%02hhx%02hhx\n--\n", - buf_old[0], - buf_old[1], - buf_old[2], - buf_old[3] + buf_old[0], + buf_old[1], + buf_old[2], + buf_old[3] - ); + ); } - + return (crc32_current == this->crc32f); } diff --git a/src/fuse.cpp b/src/fuse.cpp index 32c6d272..39a78508 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -416,7 +416,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) //fuse option variable to send to fuse argv_fuse[fi->argc_fuse++] = (char *) "fastafs"; // becomes fuse.fastafs - printf("checkpoint a\n"); + printf("checkpoint a\n"); std::vector fuse_options = {}; // those that need to be appended later @@ -472,12 +472,12 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } - printf("checkpoint b\n"); + printf("checkpoint b\n"); if(full_args.size() > 2) { - printf("checkpoint c\n"); - printf("full_args.size() = %i\n", full_args.size()); + printf("checkpoint c\n"); + printf("full_args.size() = %i\n", full_args.size()); int mount_target_arg = full_args[full_args.size() - 2 ]; // last two arguments are and , location to last 2 args not starting with --/- are in this vector printf("out of bound???\n"); @@ -517,7 +517,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) } } - printf("checkpoint c\n"); + printf("checkpoint c\n"); return fi; } @@ -533,7 +533,7 @@ void fuse(int argc, char *argv[]) // - @todo at some point define that second mount is not really important? if possible char *argv2[argc]; fuse_instance *ffi = parse_args(argc, argv, argv2); - + printf("checkpoint\n"); // part 2 - print what the planning is diff --git a/src/main.cpp b/src/main.cpp index d79a759f..02e75c35 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -257,8 +257,7 @@ int main(int argc, char *argv[]) bool check2 = f.check_sequence_integrity(); if(check1 and check2) { return 0; - } - else { + } else { return EIO; } } else { diff --git a/src/ucsc2bit_to_fastafs.cpp b/src/ucsc2bit_to_fastafs.cpp index d15033b4..87cb267c 100644 --- a/src/ucsc2bit_to_fastafs.cpp +++ b/src/ucsc2bit_to_fastafs.cpp @@ -268,7 +268,7 @@ size_t ucsc2bit_to_fastafs(std::string ucsc2bit_file, std::string fastafs_file) fh_fastafs.seekp(0, std::ios::end); - fastafs f(""); + fastafs f(""); f.load(fastafs_file); uint32_t crc32c = f.get_crc32(); diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 14277b50..2e908f74 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -195,10 +195,10 @@ BOOST_AUTO_TEST_CASE(test_cache) file.read(buffer, size); BOOST_CHECK_EQUAL(file.gcount(), size); file.close(); - + //BOOST_CHECK_UNEQUAL(ret, -1); - - + + for(unsigned int i = 0; i < size; i++) { BOOST_CHECK_EQUAL(buffer[i], reference[i]); @@ -216,7 +216,7 @@ BOOST_AUTO_TEST_CASE(test_cache) // check fastafs filesize fastafs f = fastafs(""); f.load("tmp/test_004.fastafs"); - BOOST_CHECK_EQUAL(f.fastafs_filesize() , 125); + BOOST_CHECK_EQUAL(f.fastafs_filesize(), 125); } diff --git a/test/cache/test_cache_twobit.cpp b/test/cache/test_cache_twobit.cpp index b0c33d0a..8a9c8662 100644 --- a/test/cache/test_cache_twobit.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -316,7 +316,7 @@ BOOST_AUTO_TEST_CASE(test_cache) // METADATA "\x00"s // [399] no metadata fields [padding will come soon?] - + // CRC32 checksums "\x1e\x77\x77\x22"s ; @@ -348,12 +348,12 @@ BOOST_AUTO_TEST_CASE(test_cache) } delete[] buffer; - - + + // check computed file size fastafs f = fastafs(""); f.load("tmp/test_cache_test.fastafs"); - BOOST_CHECK_EQUAL( f.fastafs_filesize() , 403 ); + BOOST_CHECK_EQUAL(f.fastafs_filesize(), 403); } From d7e19ee27a9ebe7c7233d4d49d5f0180f1d20724 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Sat, 18 Jan 2020 09:14:11 +0100 Subject: [PATCH 076/119] fixed issue --- include/fastafs.hpp | 3 +-- src/fastafs.cpp | 35 ++++++++++++++++++++++++++++------- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 8a72242d..97965bb0 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -80,8 +80,7 @@ class fastafs_seq std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. - uint32_t n_twobits(); - uint32_t n_fourbits(); + uint32_t n_bits(); static uint32_t n_padding(uint32_t, uint32_t, uint32_t); bool get_n_offset(uint32_t, uint32_t *); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index fa6291ea..08e137ab 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -447,18 +447,39 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) -// @todo make this->n_bits -uint32_t fastafs_seq::n_twobits() +uint32_t fastafs_seq::n_bits() { + uint32_t n = this->n;// number of characters + + // minus number of masked characters + for(uint32_t i = 0; i < this->n_starts.size(); i++) { + n -= n_ends[i] - this->n_starts[i] + 1; + } + + // divided by bits per bytes + if(this->flags.is_twobit()) { + // if n actg bits is: + // 0 -> 0 + // 1,2,3 and 4 -> 1 + return (n + (twobit_byte::nucleotides_per_byte - 1)) / twobit_byte::nucleotides_per_byte; + } + else if (this->flags.is_fourbit()) { + return (n + (fourbit_byte::nucleotides_per_byte - 1)) / fourbit_byte::nucleotides_per_byte; + } + else { + return 0; // unclear yet + } +} + + +/* +uint32_t fastafs_seq::n_bits() { - // if n actg bits is: - // 0 -> 0 - // 1,2,3 and 4 -> 1 uint32_t n = this->n; for(uint32_t i = 0; i < this->n_starts.size(); i++) { n -= n_ends[i] - this->n_starts[i] + 1; } return (n + 3) / 4; -} +}*/ //@brief calculates the number of paddings found in a sequence of length N with @@ -1177,7 +1198,7 @@ size_t fastafs::fastafs_filesize(void) // compr dataa n += 4 + 4 + 4;// compressed nuc. + n blocks + m blocks - n += this->data[i]->n_twobits(); + n += this->data[i]->n_bits(); n += this->data[i]->n_starts.size() * 8; n += 16;//md5 sum, always present? n += this->data[i]->m_starts.size() * 8; From b0bcd65960d2ed08761e28aec8909016ffb4e6be Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Sat, 18 Jan 2020 09:31:47 +0100 Subject: [PATCH 077/119] sav --- include/fasta_to_fourbit_fastafs.hpp | 9 +++++---- src/fasta_to_fourbit_fastafs.cpp | 13 +++++++------ src/fastafs.cpp | 10 +++++----- src/fuse.cpp | 2 +- 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/include/fasta_to_fourbit_fastafs.hpp b/include/fasta_to_fourbit_fastafs.hpp index 58985694..c104e860 100644 --- a/include/fasta_to_fourbit_fastafs.hpp +++ b/include/fasta_to_fourbit_fastafs.hpp @@ -27,9 +27,10 @@ class fasta_seq_header_fourbit_conversion_data bool previous_was_N; - fasta_seq_header_fourbit_conversion_data(off_t fof, std::string name): - file_offset_in_fasta(fof), - name(name), + fasta_seq_header_fourbit_conversion_data(off_t arg_fof, + std::string &arg_name): + file_offset_in_fasta(arg_fof), + name(arg_name), N(0), n_actg(0), previous_was_N(false), @@ -57,5 +58,5 @@ class fasta_seq_header_fourbit_conversion_data }; -size_t fasta_to_fourbit_fastafs(const std::string, const std::string); +size_t fasta_to_fourbit_fastafs(const std::string &, const std::string &); diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index c5d4e7f6..275f8baf 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -131,7 +131,7 @@ void fasta_seq_header_fourbit_conversion_data::finish_sequence(std::ofstream &fh -size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string fastafs_file) +size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string &fastafs_file) { std::vector index; fasta_seq_header_fourbit_conversion_data* s; @@ -571,7 +571,7 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string fh_fastafs.seekp(0, std::ios::end); - printf("file size now: %i\n", fh_fastafs.tellp()); + printf("file size now: %i\n", (uint32_t) fh_fastafs.tellp()); @@ -590,14 +590,15 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string terminate = true; } //printf("alive [%i]\n", fh_fastafs_crc.gcount()); - iii = fh_fastafs_crc.gcount(); + iii = (uint32_t) fh_fastafs_crc.gcount(); crc = crc32(crc, (const Bytef*)& buffer, iii); nnn += iii; if(terminate) { togo = false; } - }; + } + // -- printf("nnn = %i\n", nnn); @@ -612,15 +613,15 @@ size_t fasta_to_fourbit_fastafs(const std::string fasta_file, const std::string - // calc written size fh_fastafs.seekp(0, std::ios::end); size_t written = fh_fastafs.tellp(); - printf("file size now: %i\n", fh_fastafs.tellp()); + //printf("file size now: %i\n", fh_fastafs.tellp()); fh_fastafs.close(); return written; } + diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 08e137ab..703d4dcf 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1470,22 +1470,22 @@ uint32_t fastafs::get_crc32(void) } // starts at 4th - uint32_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; + size_t total_bytes_to_be_read = this->fastafs_filesize() - 4 - 4 ; uLong crc = crc32(0L, Z_NULL, 0); const int buffer_size = 4; char buffer[buffer_size + 1]; - uint32_t bytes_to_be_read_this_iter; - uint32_t bytes_actually_read_this_iter; + size_t bytes_to_be_read_this_iter; + size_t bytes_actually_read_this_iter; // now calculate crc32 checksum, as all bits have been set. std::ifstream fh_fastafs_crc(this->filename.c_str(), std::ios :: out | std::ios :: binary); fh_fastafs_crc.seekg(4, std::ios::beg);// skip magic number, this must be ok otherwise the toolkit won't use the file anyway while(total_bytes_to_be_read > 0) { - bytes_to_be_read_this_iter = std::min((uint32_t) buffer_size, total_bytes_to_be_read) ; + bytes_to_be_read_this_iter = std::min((size_t) buffer_size, total_bytes_to_be_read) ; fh_fastafs_crc.read(buffer, bytes_to_be_read_this_iter); total_bytes_to_be_read -= bytes_to_be_read_this_iter; @@ -1493,7 +1493,7 @@ uint32_t fastafs::get_crc32(void) if(bytes_actually_read_this_iter == 0) { total_bytes_to_be_read = 0; // unexpected eof? } else { - crc = crc32(crc, (const Bytef*)& buffer, bytes_actually_read_this_iter); + crc = crc32(crc, (const Bytef*)& buffer, (uint32_t) bytes_actually_read_this_iter); } } diff --git a/src/fuse.cpp b/src/fuse.cpp index 39a78508..7749551a 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -477,7 +477,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) if(full_args.size() > 2) { printf("checkpoint c\n"); - printf("full_args.size() = %i\n", full_args.size()); + printf("full_args.size() = %u\n", (uint32_t) full_args.size()); int mount_target_arg = full_args[full_args.size() - 2 ]; // last two arguments are and , location to last 2 args not starting with --/- are in this vector printf("out of bound???\n"); From fce2dfb2b3a8d979045ab296cd5ecb21b924b9e2 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Sat, 18 Jan 2020 09:36:03 +0100 Subject: [PATCH 078/119] sav --- include/fasta_to_fourbit_fastafs.hpp | 2 +- src/fastafs.cpp | 37 ++++++++++++---------------- 2 files changed, 17 insertions(+), 22 deletions(-) diff --git a/include/fasta_to_fourbit_fastafs.hpp b/include/fasta_to_fourbit_fastafs.hpp index c104e860..b26149c0 100644 --- a/include/fasta_to_fourbit_fastafs.hpp +++ b/include/fasta_to_fourbit_fastafs.hpp @@ -28,7 +28,7 @@ class fasta_seq_header_fourbit_conversion_data fasta_seq_header_fourbit_conversion_data(off_t arg_fof, - std::string &arg_name): + std::string &arg_name): file_offset_in_fasta(arg_fof), name(arg_name), N(0), diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 703d4dcf..5bd196eb 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -447,7 +447,8 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) -uint32_t fastafs_seq::n_bits() { +uint32_t fastafs_seq::n_bits() +{ uint32_t n = this->n;// number of characters // minus number of masked characters @@ -461,11 +462,9 @@ uint32_t fastafs_seq::n_bits() { // 0 -> 0 // 1,2,3 and 4 -> 1 return (n + (twobit_byte::nucleotides_per_byte - 1)) / twobit_byte::nucleotides_per_byte; - } - else if (this->flags.is_fourbit()) { + } else if(this->flags.is_fourbit()) { return (n + (fourbit_byte::nucleotides_per_byte - 1)) / fourbit_byte::nucleotides_per_byte; - } - else { + } else { return 0; // unclear yet } } @@ -1516,26 +1515,22 @@ bool fastafs::check_file_integrity() uint_to_fourbytes(buf_new, (uint32_t) crc32_current); printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (in-file) != %02hhx%02hhx%02hhx%02hhx (actual file)\n--\n", - buf_old[0], - buf_old[1], - buf_old[2], - buf_old[3], - - buf_new[0], - buf_new[1], - buf_new[2], - buf_new[3] + (unsigned char) buf_old[0], + (unsigned char) buf_old[1], + (unsigned char) buf_old[2], + (unsigned char) buf_old[3], - ); + (unsigned char) buf_new[0], + (unsigned char) buf_new[1], + (unsigned char) buf_new[2], + (unsigned char) buf_new[3]); } else { printf("OK\t%02hhx%02hhx%02hhx%02hhx\n--\n", - buf_old[0], - buf_old[1], - buf_old[2], - buf_old[3] - - ); + (unsigned char) buf_old[0], + (unsigned char) buf_old[1], + (unsigned char) buf_old[2], + (unsigned char) buf_old[3]); } return (crc32_current == this->crc32f); From e9491444626c29f1b9685c07d7533e15ca28ec93 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Sat, 18 Jan 2020 09:38:30 +0100 Subject: [PATCH 079/119] sav --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 77466b2f..0845ed9e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# fastafs: fuse layer and file system for storing FASTA files +# FASTAFS: toolkit for file system virtualisation of random access compressed FASTA files ---- From 2c4af9257c3cd2fe8ef01cfdff834e7a8ac031cf Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 15:06:44 +0100 Subject: [PATCH 080/119] sav --- src/fuse.cpp | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/fuse.cpp b/src/fuse.cpp index 7749551a..0bf28720 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -32,6 +32,8 @@ struct fuse_instance { //fastasfs fastafs *f; ffs2f_init *cache; + ffs2f_init *cache_p0;// cache with padding of 0; used by API '/seq/chr1:123:456' + bool from_fastafs; // if false, from 2bit // ucsc2bit @@ -66,6 +68,7 @@ static int do_getattr(const char *path, struct stat *st) st->st_atime = time(NULL); // The last "a"ccess of the file/directory is right now st->st_mtime = time(NULL); // The last "m"odification of the file/directory is right now + printf("[%s]\n" , path); if(strcmp(path, "/") == 0) { //st->st_mode = S_IFREG | 0644; //st->st_nlink = 1; @@ -73,6 +76,17 @@ static int do_getattr(const char *path, struct stat *st) //directory st->st_mode = S_IFDIR | 0755; st->st_nlink = 2; // Why "two" hardlinks instead of "one"? The answer is here: http://unix.stackexchange.com/a/101536 + } else if(strlen(path) == 4 && strncmp(path, "/seq", 4) == 0) { + //directory + printf("setting to DIR because /seq\n"); + st->st_mode = S_IFDIR | 0755; + st->st_nlink = 1; + } else if(strlen(path) > 4 && strncmp(path, "/seq/", 5) == 0) { + // API: "/seq/chr1:123-456" + printf("setting to FILE because /seq/...\n"); + // @ todo - run a check on wether the chr exists and return err otherwise + st->st_mode = S_IFREG | 0644; + st->st_nlink = 1; } else { st->st_mode = S_IFREG | 0644; st->st_nlink = 1; @@ -156,6 +170,8 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of } } + filler(buffer, "seq", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 + return 0; } @@ -178,6 +194,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st std::string virtual_ucsc2bit_filename = "/" + ffi->f->name + ".2bit"; std::string virtual_dict_filename = "/" + ffi->f->name + ".dict"; + printf("?? [[%s]]\n", path); if(strcmp(path, virtual_fasta_filename.c_str()) == 0) { written = (signed int) ffi->f->view_fasta_chunk_cached(ffi->cache, buffer, size, offset); } else if(strcmp(path, virtual_faidx_filename.c_str()) == 0) { @@ -186,6 +203,8 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st written = (signed int) ffi->f->view_ucsc2bit_chunk(buffer, size, offset); } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); + } else if(strncmp(path, "/seq/", 5) == 0) { // api access + printf("!! [[%s]]\n", path); } } else { if(ffi->u2b != nullptr) { @@ -411,7 +430,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) //fastafs_fuse_instance *ffi = new fastafs_fuse_instance({nullptr, 60, 1, new char[argc]}); //fastafs_fuse_instance *ffi = new fastafs_fuse_instance({nullptr, 60, 0, nullptr}); - fuse_instance *fi = new fuse_instance({nullptr, nullptr, true, nullptr, 60, 0}); + fuse_instance *fi = new fuse_instance({nullptr, nullptr, nullptr, true, nullptr, 60, 0}); //fuse option variable to send to fuse argv_fuse[fi->argc_fuse++] = (char *) "fastafs"; // becomes fuse.fastafs From 9f265cf71e62554ccec1d288a456eeb06bc5c1cd Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 15:34:43 +0100 Subject: [PATCH 081/119] api mounting --- src/fuse.cpp | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/fuse.cpp b/src/fuse.cpp index 0bf28720..b0ece310 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -205,6 +205,35 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); } else if(strncmp(path, "/seq/", 5) == 0) { // api access printf("!! [[%s]]\n", path); + // 01 : convert chrom loc to string with chr + int p = -1; + for(int i = 5; i < std::min( (int) 256, (int) strlen(path)) && p == -1; i++) { + printf(":: %c\n",path[i]); + if(path[i] == ':') { + p = i; + } + } + if(p == -1) { + p = std::min((int) 256, (int) strlen(path)); + } + std::string chr = std::string(path, 5, p - 5); + std::cout << "{" << chr << "}" << "\n"; + + // 02 : check if 'chr' is equals this->data[i].name + fastafs_seq *fsq = nullptr; + for(size_t i = 0; i < ffi->f->data.size() && fsq == nullptr; i++ ) { + printf("[%s] == [%s] \n", chr.c_str() , ffi->f->data[i]->name.c_str()); + if( chr.compare(ffi->f->data[i]->name) == 0) { + fsq = ffi->f->data[i]; + } + } + + // 03 - if chr was found , ok, otherise, not ok + if(fsq == nullptr) { + return -2;// -1 = permission deinied, -2 = missing file or directory + } else { + return 0; + } } } else { if(ffi->u2b != nullptr) { From d638300cb103dcc66863358dbc46df6d890b65d8 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 16:41:56 +0100 Subject: [PATCH 082/119] dev stuff --- src/fastafs.cpp | 2 +- src/fuse.cpp | 61 ++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 5bd196eb..c7a379b7 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -144,7 +144,7 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( std::ifstream *fh) { - if(this->flags.is_dna()) { + if(this->flags.is_twobit()) { return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); } else { return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); diff --git a/src/fuse.cpp b/src/fuse.cpp index b0ece310..06a176ed 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -83,10 +83,13 @@ static int do_getattr(const char *path, struct stat *st) st->st_nlink = 1; } else if(strlen(path) > 4 && strncmp(path, "/seq/", 5) == 0) { // API: "/seq/chr1:123-456" - printf("setting to FILE because /seq/...\n"); + printf("setting to FILE [%s] because /seq/...\n", path); // @ todo - run a check on wether the chr exists and return err otherwise st->st_mode = S_IFREG | 0644; st->st_nlink = 1; + + //@todo this needs to be defined with some api stuff:!! + st->st_size = 4096; } else { st->st_mode = S_IFREG | 0644; st->st_nlink = 1; @@ -108,6 +111,8 @@ static int do_getattr(const char *path, struct stat *st) st->st_size = ffi->f->ucsc2bit_filesize(); } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { st->st_size = ffi->f->dict_filesize(); + } else if(strncmp(path, "/seq/", 5) == 0) { // api access + printf("filesize: set to 4096\n"); } } } else { @@ -171,6 +176,7 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of } filler(buffer, "seq", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 + filler(buffer, "seq/chr1:123", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 return 0; } @@ -184,7 +190,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st time_t now = time(0); strftime(cur_time, 100, "%Y-%m-%d %H:%M:%S.000", localtime(&now)); - static int written = -1; + static int written = -2;// -1 = permission deinied, -2 = missing file or directory if(ffi->from_fastafs) { printf("\033[0;32m[%s]\033[0;33m fastafs::do_read(\033[0msize=%u, offset=%u\033[0;33m):\033[0m %s \033[0;35m(fastafs: %s, padding: %u)\033[0m\n", cur_time, (uint32_t) size, (uint32_t) offset, path, ffi->f->name.c_str(), ffi->padding); @@ -204,6 +210,14 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); } else if(strncmp(path, "/seq/", 5) == 0) { // api access + buffer[0] = 't'; + buffer[1] = 'e'; + buffer[2] = 's'; + buffer[3] = 't'; + //buffer[4] = '\0'; + written = 4; + + /* printf("!! [[%s]]\n", path); // 01 : convert chrom loc to string with chr int p = -1; @@ -221,7 +235,8 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st // 02 : check if 'chr' is equals this->data[i].name fastafs_seq *fsq = nullptr; - for(size_t i = 0; i < ffi->f->data.size() && fsq == nullptr; i++ ) { + size_t i; + for(i = 0; i < ffi->f->data.size() && fsq == nullptr; i++ ) { printf("[%s] == [%s] \n", chr.c_str() , ffi->f->data[i]->name.c_str()); if( chr.compare(ffi->f->data[i]->name) == 0) { fsq = ffi->f->data[i]; @@ -229,11 +244,40 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } // 03 - if chr was found , ok, otherise, not ok - if(fsq == nullptr) { - return -2;// -1 = permission deinied, -2 = missing file or directory - } else { - return 0; - } + if(fsq != nullptr) { + buffer[0] = 't'; + buffer[1] = 'e'; + buffer[2] = 's'; + buffer[3] = 't'; + buffer[4] = '\0'; + written = 4096; + + // code below seems to work, but copying to buf doesn't seem to work? + + std::ifstream file(ffi->f->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if(file.is_open()) { + printf("alive?\n"); + printf("padding: %i\n", ffi->cache_p0->sequences[i]->padding); + + written = (signed int) fsq->view_fasta_chunk_cached( + ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, + buffer, // char *buffer + (size_t) size, // size_t buffer_size, + (off_t) 0, // off_t start_pos_in_fasta, + &file // std::ifstream *fh) + ); + + printf("\nwritten: %i\n", (int) written); + + for(int kk = 0; kk < written ; kk++) { + printf("%c", buffer[kk]); + } + + printf("\nwritten: %i\n", (int) written); + + } + file.close(); + */ } } else { if(ffi->u2b != nullptr) { @@ -550,6 +594,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) fi->f = new fastafs(name); fi->f->load(fname); fi->cache = fi->f->init_ffs2f(fi->padding, true);// allow mixed case + fi->cache_p0 = fi->f->init_ffs2f(0, true);// allow mixed case } else { std::string basename = basename_cpp(std::string(argv[mount_target_arg])); //std::string basename = std::filesystem::path(std::string(argv[mount_target_arg])).filename(); From 1d14552185379f3c5aee796289564ab17ce949f3 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 16:46:44 +0100 Subject: [PATCH 083/119] sav --- src/fuse.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/fuse.cpp b/src/fuse.cpp index 06a176ed..0a40dbf3 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -210,14 +210,15 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); } else if(strncmp(path, "/seq/", 5) == 0) { // api access + /* buffer[0] = 't'; buffer[1] = 'e'; buffer[2] = 's'; buffer[3] = 't'; //buffer[4] = '\0'; written = 4; + */ - /* printf("!! [[%s]]\n", path); // 01 : convert chrom loc to string with chr int p = -1; @@ -245,13 +246,6 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st // 03 - if chr was found , ok, otherise, not ok if(fsq != nullptr) { - buffer[0] = 't'; - buffer[1] = 'e'; - buffer[2] = 's'; - buffer[3] = 't'; - buffer[4] = '\0'; - written = 4096; - // code below seems to work, but copying to buf doesn't seem to work? std::ifstream file(ffi->f->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); @@ -263,7 +257,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, buffer, // char *buffer (size_t) size, // size_t buffer_size, - (off_t) 0, // off_t start_pos_in_fasta, + (off_t) 2 + fsq->name.size(), // off_t start_pos_in_fasta, &file // std::ifstream *fh) ); @@ -277,7 +271,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } file.close(); - */ + } } } else { if(ffi->u2b != nullptr) { From d03ec9f4fb084e43edb7797263f3c7f95be5857d Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 17:31:42 +0100 Subject: [PATCH 084/119] sav --- CMakeLists.txt | 1 + include/SequenceRegion.hpp | 30 +++++++++++ src/SequenceRegion.cpp | 60 +++++++++++++++++++++ test/CMakeLists.txt | 3 ++ test/sequenceregion/test_sequenceregion.cpp | 34 ++++++++++++ 5 files changed, 128 insertions(+) create mode 100644 include/SequenceRegion.hpp create mode 100644 src/SequenceRegion.cpp create mode 100644 test/sequenceregion/test_sequenceregion.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index af7c0034..186818f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,6 +179,7 @@ add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs") add_test(test_ucsc2bit_as_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_as_fasta") +add_test(test_sequenceregion "${BUILD_TEST_DIR}/test_sequenceregion") add_test(test_utils "${BUILD_TEST_DIR}/test_utils") #add_test(test_tree "${BUILD_TEST_DIR}/test_tree") diff --git a/include/SequenceRegion.hpp b/include/SequenceRegion.hpp new file mode 100644 index 00000000..18fe835c --- /dev/null +++ b/include/SequenceRegion.hpp @@ -0,0 +1,30 @@ +#include +#include +#include +#include + +#include + + +#include "config.hpp" + +#include "utils.hpp" + + + +class SequenceRegion { + std::string seq_name; + + bool has_range; + bool has_end; + + off_t start; + off_t end; + + public: + SequenceRegion(char *); + + private: + void parse(char *); +}; + diff --git a/src/SequenceRegion.cpp b/src/SequenceRegion.cpp new file mode 100644 index 00000000..c99bae78 --- /dev/null +++ b/src/SequenceRegion.cpp @@ -0,0 +1,60 @@ + +#include "SequenceRegion.hpp" + + + +SequenceRegion::SequenceRegion(char * seqstr) : + seq_name("") , has_range(false), has_end(false), start(0), end(0) { + + parse(seqstr); + +} + + +void SequenceRegion::parse(char * seqstr) { + printf("123'\n"); + printf("[%s]\n", seqstr); + printf("---\n"); + + printf("strlen: %i\n", strlen(seqstr)); + + // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" + size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); + ssize_t p = -1; + for(size_t i = 0; i < string_max_pos && p == -1; i++) { + if(seqstr[i] == ':') { + p = (size_t) i; + } + } + + printf("p = %i\n", (int) p); + + if(p > 0) { + this->seq_name = std::string(seqstr , 0 , p); + printf("| %s |\n", this->seq_name.c_str()); + } else if(p == -1) { + + // either with string > 255 chars or string smaller than 255 without ':' + this->seq_name = std::string(seqstr , 0 , string_max_pos); + printf(": %s :\n", this->seq_name.c_str()); + } + + if(p != -1) { + // we can parse numbers + } + + +/* + for(int i = 5; i < std::min( (int) 256, (int) strlen(path)) && p == -1; i++) { + printf(":: %c\n",path[i]); + if(path[i] == ':') { + p = i; + } + } + if(p == -1) { + p = std::min((int) 256, (int) strlen(path)); + } + std::string chr = std::string(path, 5, p - 5); + std::cout << "{" << chr << "}" << "\n"; + */ +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a6c92bdd..807041c1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -28,6 +28,7 @@ add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_sequenceregion sequenceregion/test_sequenceregion.cpp ../src/SequenceRegion.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) #add_executable(test_tree tree/test_tree.cpp) @@ -54,6 +55,8 @@ set_target_properties(test_ucsc2bit_to_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_ucsc2bit_as_fasta PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_sequenceregion + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_utils PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") #set_target_properties(test_tree diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp new file mode 100644 index 00000000..638bec91 --- /dev/null +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -0,0 +1,34 @@ +#define BOOST_TEST_MODULE sequence_region + +#include + +#include "config.hpp" + +#include "SequenceRegion.hpp" + + +//#include +//#include + + +BOOST_AUTO_TEST_SUITE(Testing) + + +BOOST_AUTO_TEST_CASE(test_sequence_region) +{ + /* + * Goal is to parse the following strings: + * "chr1" + * "chr1:" + * "chr1:123-456" + * "chr1:123-" + * "chr1:123-456:asdasd" error + * "chr1:-456" == "chr1:0-456" + */ + + char arg[] = "/seq/chr1"; + + SequenceRegion sr = SequenceRegion(&(arg[5])); +} + +BOOST_AUTO_TEST_SUITE_END() From 2e110609d01740dc504e185c1be05b2290cb67d6 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 17:33:13 +0100 Subject: [PATCH 085/119] sav --- src/SequenceRegion.cpp | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/SequenceRegion.cpp b/src/SequenceRegion.cpp index c99bae78..e4d9d314 100644 --- a/src/SequenceRegion.cpp +++ b/src/SequenceRegion.cpp @@ -41,20 +41,8 @@ void SequenceRegion::parse(char * seqstr) { if(p != -1) { // we can parse numbers + + // find position of '-' character } - -/* - for(int i = 5; i < std::min( (int) 256, (int) strlen(path)) && p == -1; i++) { - printf(":: %c\n",path[i]); - if(path[i] == ':') { - p = i; - } - } - if(p == -1) { - p = std::min((int) 256, (int) strlen(path)); - } - std::string chr = std::string(path, 5, p - 5); - std::cout << "{" << chr << "}" << "\n"; - */ } From c911a32ad52d2dfd0182f96d669fdf02343eac7b Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 20:12:11 +0100 Subject: [PATCH 086/119] closer to resolving --- include/SequenceRegion.hpp | 16 ++++----- include/config.hpp.in | 4 +++ src/SequenceRegion.cpp | 28 +++++++++++++-- test/sequenceregion/test_sequenceregion.cpp | 38 +++++++++++++++++---- 4 files changed, 69 insertions(+), 17 deletions(-) diff --git a/include/SequenceRegion.hpp b/include/SequenceRegion.hpp index 18fe835c..a3788c29 100644 --- a/include/SequenceRegion.hpp +++ b/include/SequenceRegion.hpp @@ -13,16 +13,16 @@ class SequenceRegion { - std::string seq_name; - - bool has_range; - bool has_end; - - off_t start; - off_t end; - public: SequenceRegion(char *); + + std::string seq_name; + + bool has_range; + bool has_end; + + off_t start; + off_t end; private: void parse(char *); diff --git a/include/config.hpp.in b/include/config.hpp.in index 25046075..048d2d77 100644 --- a/include/config.hpp.in +++ b/include/config.hpp.in @@ -64,4 +64,8 @@ static const std::string DICT_HEADER = "@HD\tVN:1.0\tSO:unsorted\n"; static const std::string FASTAFS_FILE_XATTR_NAME = "fastafs-file"; static const std::string FASTAFS_PID_XATTR_NAME = "fastafs-pid"; + +static const size_t MAX_SIZE_SEQ_NAME = 255; + + #endif diff --git a/src/SequenceRegion.cpp b/src/SequenceRegion.cpp index e4d9d314..c0767352 100644 --- a/src/SequenceRegion.cpp +++ b/src/SequenceRegion.cpp @@ -39,10 +39,34 @@ void SequenceRegion::parse(char * seqstr) { printf(": %s :\n", this->seq_name.c_str()); } - if(p != -1) { + printf("\n"); + // chr1:1 + // p = 4 + // strlen = 6 + if(p != -1 and strlen(seqstr) > (p + 1)) { // we can parse numbers - // find position of '-' character + + ssize_t p2 = -1; + + for(size_t i = p; i < strlen(seqstr) && p2 == -1; i++) { + if(seqstr[i] == ':') { + p2 = (size_t) i; + } + } + + printf("p2 = %i , p = %i\n", p2, p); + if(p2 == p) {// chrA:-123 + printf("yes!\n"); + this->start = 0; + printf(" end = [%s]\n", std::string(seqstr,p + 2,strlen(seqstr)).c_str()); + this->end = std::stoi( std::string(seqstr,p + 2,strlen(seqstr)) ) ; + } + else if(p2 > strlen(seqstr) + 2) { + + } + + printf("p2 = %i\n", (int) p); } } diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index 638bec91..de6d78f2 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -13,22 +13,46 @@ BOOST_AUTO_TEST_SUITE(Testing) - -BOOST_AUTO_TEST_CASE(test_sequence_region) -{ - /* +/* * Goal is to parse the following strings: * "chr1" * "chr1:" + * "chr1:123" # single base * "chr1:123-456" * "chr1:123-" * "chr1:123-456:asdasd" error * "chr1:-456" == "chr1:0-456" */ - - char arg[] = "/seq/chr1"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + +BOOST_AUTO_TEST_CASE(test_sequence_region) +{ + { + char arg[] = "/seq/chr1"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + } + + { + char arg[] = "/seq/chr1:"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + } + + + { + char arg[] = "/seq/chr1:-123"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + + BOOST_CHECK_EQUAL(sr.start , 0); // zero based as defined in SAM specification + BOOST_CHECK_EQUAL(sr.end , 123); // + } + + } BOOST_AUTO_TEST_SUITE_END() From db1fbeeb4a06fa6fd02e7191dbce9098e05ae0c7 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 21:13:48 +0100 Subject: [PATCH 087/119] sav --- include/SequenceRegion.hpp | 3 +- src/SequenceRegion.cpp | 129 ++++++++++--------- test/sequenceregion/test_sequenceregion.cpp | 131 +++++++++++++++++++- 3 files changed, 199 insertions(+), 64 deletions(-) diff --git a/include/SequenceRegion.hpp b/include/SequenceRegion.hpp index a3788c29..8e6d3931 100644 --- a/include/SequenceRegion.hpp +++ b/include/SequenceRegion.hpp @@ -18,8 +18,7 @@ class SequenceRegion { std::string seq_name; - bool has_range; - bool has_end; + bool has_defined_end; off_t start; off_t end; diff --git a/src/SequenceRegion.cpp b/src/SequenceRegion.cpp index c0767352..22d286eb 100644 --- a/src/SequenceRegion.cpp +++ b/src/SequenceRegion.cpp @@ -4,69 +4,82 @@ SequenceRegion::SequenceRegion(char * seqstr) : - seq_name("") , has_range(false), has_end(false), start(0), end(0) { + seq_name("") , has_defined_end(false), start(0), end(0) { - parse(seqstr); + parse(seqstr); } void SequenceRegion::parse(char * seqstr) { - printf("123'\n"); - printf("[%s]\n", seqstr); - printf("---\n"); - - printf("strlen: %i\n", strlen(seqstr)); - - // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" - size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); - ssize_t p = -1; - for(size_t i = 0; i < string_max_pos && p == -1; i++) { - if(seqstr[i] == ':') { - p = (size_t) i; - } - } - - printf("p = %i\n", (int) p); - - if(p > 0) { - this->seq_name = std::string(seqstr , 0 , p); - printf("| %s |\n", this->seq_name.c_str()); - } else if(p == -1) { - - // either with string > 255 chars or string smaller than 255 without ':' - this->seq_name = std::string(seqstr , 0 , string_max_pos); - printf(": %s :\n", this->seq_name.c_str()); - } - - printf("\n"); - // chr1:1 - // p = 4 - // strlen = 6 - if(p != -1 and strlen(seqstr) > (p + 1)) { - // we can parse numbers - // find position of '-' character - - ssize_t p2 = -1; - - for(size_t i = p; i < strlen(seqstr) && p2 == -1; i++) { - if(seqstr[i] == ':') { - p2 = (size_t) i; - } - } - - printf("p2 = %i , p = %i\n", p2, p); - if(p2 == p) {// chrA:-123 - printf("yes!\n"); - this->start = 0; - printf(" end = [%s]\n", std::string(seqstr,p + 2,strlen(seqstr)).c_str()); - this->end = std::stoi( std::string(seqstr,p + 2,strlen(seqstr)) ) ; - } - else if(p2 > strlen(seqstr) + 2) { - - } - - printf("p2 = %i\n", (int) p); - } + // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" + size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); + ssize_t p = -1; + for(size_t i = 0; i < string_max_pos && p == -1; i++) { + if(seqstr[i] == ':') { + p = (size_t) i; + } + } + + if(p > 0) { + this->seq_name = std::string(seqstr , 0 , p); + } else if(p == -1) { + + // either with string > 255 chars or string smaller than 255 without ':' + this->seq_name = std::string(seqstr , 0 , string_max_pos); + } + + // chr1:1 + // p = 4 + // strlen = 6 + if(p != -1 and strlen(seqstr) > (p + 1)) { + // we can parse numbers + // find position of '-' character + ssize_t p2 = -1; + + for(size_t i = p; i < strlen(seqstr) && p2 == -1; i++) { + if(seqstr[i] == '-') { + p2 = (size_t) i; + } + } + + + if(p2 == -1) { // chrA:123 + std::string start = std::string(seqstr,p + 1,p2 - p - 1); + + this->start = std::stoi( start ); + + this->has_defined_end = true; + this->end = this->start; + } else if(p2 == (p + 1)) {// chrA:-123 + std::string end = std::string(seqstr,p2 + 1,strlen(seqstr) - p2 - 1); + + this->start = 0; + this->end = std::stoi( end ) ; + + this->has_defined_end = true; + } else if(p2 > (p + 1)) { // chrA:123- | chrA:123-456 | chrA:123-456ERR + if(p2 + 1 == strlen(seqstr)) { // chrA:123- + std::string start = std::string(seqstr,p + 1,p2 - p - 1); + + this->start = std::stoi(start); + this->has_defined_end = false; + } else { // chrA:123-456 | chrA:123-456ERR + std::string start = std::string(seqstr,p + 1,p2 - p - 1); + std::string end = std::string(seqstr,p2 + 1,strlen(seqstr) - p2 - 1); + + + this->start = std::stoi( start ) ; + + this->has_defined_end = true; + this->end = std::stoi( end ) ; + } + } + + } + + if(this->has_defined_end and this->start > this->end) { + throw std::invalid_argument("Invalid region - start larger than end."); + } } diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index de6d78f2..f4442649 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -18,10 +18,11 @@ BOOST_AUTO_TEST_SUITE(Testing) * "chr1" * "chr1:" * "chr1:123" # single base - * "chr1:123-456" * "chr1:123-" - * "chr1:123-456:asdasd" error + * "chr1:123-456" * "chr1:-456" == "chr1:0-456" + + * "chr1:123-456:asdasd" error */ @@ -32,6 +33,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) SequenceRegion sr = SequenceRegion(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end } { @@ -39,20 +41,141 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) SequenceRegion sr = SequenceRegion(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end } + { + char arg[] = "/seq/chr1:123"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + BOOST_CHECK_EQUAL(sr.start , 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 123); + } { char arg[] = "/seq/chr1:-123"; SequenceRegion sr = SequenceRegion(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + BOOST_CHECK_EQUAL(sr.start , 0); - BOOST_CHECK_EQUAL(sr.start , 0); // zero based as defined in SAM specification - BOOST_CHECK_EQUAL(sr.end , 123); // + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 123); } + + { + char arg[] = "/seq/chr1:123-456"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + BOOST_CHECK_EQUAL(sr.start , 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 456); + } + + + { + char arg[] = "/seq/chr1:123-"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); + BOOST_CHECK_EQUAL(sr.start , 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end , false); + //BOOST_CHECK_EQUAL(sr.end , 456); - underfined + } + + { + char arg[] = "/seq/chr1:456-123"; + + SequenceRegion *sr = nullptr; + if(sr == nullptr) {// compiler doesn't understand this otherwise + BOOST_CHECK_THROW(sr = new SequenceRegion(&(arg[5])) , std::invalid_argument); + } + } } + + +BOOST_AUTO_TEST_CASE(test_sequence_region2) +{ + { + char arg[] = "/seq/chrRr1"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:123"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 123); + } + + { + char arg[] = "/seq/chrRr1:-123"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 0); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 123); + } + + { + char arg[] = "/seq/chrRr1:123-456"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 456); + } + + + { + char arg[] = "/seq/chrRr1:123-"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 123); + + BOOST_CHECK_EQUAL(sr.has_defined_end , false); + //BOOST_CHECK_EQUAL(sr.end , 456); - underfined + } + + { + char arg[] = "/seq/chrRr1:456-123"; + + SequenceRegion *sr = nullptr; + if(sr == nullptr) {// compiler doesn't understand this otherwise + BOOST_CHECK_THROW(sr = new SequenceRegion(&(arg[5])) , std::invalid_argument); + } + } + +} + + + BOOST_AUTO_TEST_SUITE_END() From c3f0812e156079e9f1a4d40f71db7bee80c2a58d Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 20 Jan 2020 21:20:06 +0100 Subject: [PATCH 088/119] more tests done --- test/sequenceregion/test_sequenceregion.cpp | 77 +++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index f4442649..b84f4955 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -102,6 +102,83 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) +BOOST_AUTO_TEST_CASE(test_sequence_region3) +{ + { + char arg[] = "/seq/chrRr1"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + } + + { + char arg[] = "/seq/chrRr1:1234"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 1234); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 1234); + } + + { + char arg[] = "/seq/chrRr1:-1234"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 0); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 1234); + } + + { + char arg[] = "/seq/chrRr1:1234-1235"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 1234); + + BOOST_CHECK_EQUAL(sr.has_defined_end , true); + BOOST_CHECK_EQUAL(sr.end , 1235); + } + + + { + char arg[] = "/seq/chrRr1:1234-"; + SequenceRegion sr = SequenceRegion(&(arg[5])); + + BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); + BOOST_CHECK_EQUAL(sr.start , 1234); + + BOOST_CHECK_EQUAL(sr.has_defined_end , false); + //BOOST_CHECK_EQUAL(sr.end , 1235); - underfined + } + + { + char arg[] = "/seq/chrRr1:1235-1234"; + + SequenceRegion *sr = nullptr; + if(sr == nullptr) {// compiler doesn't understand this otherwise + BOOST_CHECK_THROW(sr = new SequenceRegion(&(arg[5])) , std::invalid_argument); + } + } + +} + + + + BOOST_AUTO_TEST_CASE(test_sequence_region2) { { From 10b1cef724cd342dcaa5c94671deb14f38957390 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 09:17:31 +0100 Subject: [PATCH 089/119] api type access works --- CMakeLists.txt | 48 +++++++++-------- ...SequenceRegion.hpp => sequence_region.hpp} | 7 +-- src/fuse.cpp | 54 +++++++++---------- ...SequenceRegion.cpp => sequence_region.cpp} | 13 +++-- test/CMakeLists.txt | 2 +- test/sequenceregion/test_sequenceregion.cpp | 50 ++++++++--------- 6 files changed, 91 insertions(+), 83 deletions(-) rename include/{SequenceRegion.hpp => sequence_region.hpp} (69%) rename src/{SequenceRegion.cpp => sequence_region.cpp} (87%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 186818f4..357e221c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,6 +111,7 @@ add_executable(fastafs src/fourbit_byte.cpp src/database.cpp src/utils.cpp + src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp ) @@ -128,35 +129,36 @@ add_executable(mount.fastafs src/fourbit_byte.cpp src/database.cpp src/utils.cpp + src/sequence_region.cpp src/fuse.cpp src/lsfastafs.cpp ) set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}") -add_library(libfastafs SHARED - src/fasta_to_twobit_fastafs.cpp - src/ucsc2bit_to_fastafs.cpp - src/flags.cpp - src/fastafs.cpp - src/ucsc2bit.cpp - src/twobit_byte.cpp - src/fourbit_byte.cpp - src/database.cpp - src/utils.cpp - src/fuse.cpp - src/lsfastafs.cpp -) -target_include_directories(libfastafs PUBLIC include) -target_sources(libfastafs PUBLIC include/fastafs.hpp) - -set_target_properties(libfastafs PROPERTIES LIBRARY_OUTPUT_DIRECTORY "lib") -set_target_properties(libfastafs PROPERTIES VERSION ${PROJECT_VERSION}) -set_target_properties(libfastafs PROPERTIES SOVERSION 1) -set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs) +#add_library(libfastafs SHARED + #src/fasta_to_twobit_fastafs.cpp + #src/ucsc2bit_to_fastafs.cpp + #src/flags.cpp + #src/fastafs.cpp + #src/ucsc2bit.cpp + #src/twobit_byte.cpp + #src/fourbit_byte.cpp + #src/database.cpp + #src/utils.cpp + #src/fuse.cpp + #src/lsfastafs.cpp +#) +#target_include_directories(libfastafs PUBLIC include) +#target_sources(libfastafs PUBLIC include/fastafs.hpp) + +#set_target_properties(libfastafs PROPERTIES LIBRARY_OUTPUT_DIRECTORY "lib") +#set_target_properties(libfastafs PROPERTIES VERSION ${PROJECT_VERSION}) +#set_target_properties(libfastafs PROPERTIES SOVERSION 1) +#set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs) #set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include") # great, this doesn't go automagically with an entire dir -set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") +#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") #set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include) #set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include") @@ -192,6 +194,8 @@ add_test(test_utils "${BUILD_TEST_DIR}/test_utils") # The compiled binary, usually to: /usr/local/bin/fastafs install(TARGETS fastafs DESTINATION "bin") install(TARGETS mount.fastafs DESTINATION "bin") -install(TARGETS libfastafs LIBRARY DESTINATION "lib" PUBLIC_HEADER DESTINATION "include/libfastafs") + +# don't build during debug at least +#install(TARGETS libfastafs LIBRARY DESTINATION "lib" PUBLIC_HEADER DESTINATION "include/libfastafs") # ---------------------------------------------------------------------- diff --git a/include/SequenceRegion.hpp b/include/sequence_region.hpp similarity index 69% rename from include/SequenceRegion.hpp rename to include/sequence_region.hpp index 8e6d3931..16f5b3a7 100644 --- a/include/SequenceRegion.hpp +++ b/include/sequence_region.hpp @@ -12,9 +12,10 @@ -class SequenceRegion { +class sequence_region { public: - SequenceRegion(char *); + sequence_region(char *); + sequence_region(const char *); std::string seq_name; @@ -24,6 +25,6 @@ class SequenceRegion { off_t end; private: - void parse(char *); + void parse(const char *); }; diff --git a/src/fuse.cpp b/src/fuse.cpp index 0a40dbf3..36ecdd14 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -21,6 +21,7 @@ #include "database.hpp" #include "fastafs.hpp" #include "ucsc2bit.hpp" +#include "sequence_region.hpp" // http://www.maastaar.net/fuse/linux/filesystem/c/2016/05/21/writing-a-simple-filesystem-using-fuse/ @@ -210,36 +211,15 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); } else if(strncmp(path, "/seq/", 5) == 0) { // api access - /* - buffer[0] = 't'; - buffer[1] = 'e'; - buffer[2] = 's'; - buffer[3] = 't'; - //buffer[4] = '\0'; - written = 4; - */ - - printf("!! [[%s]]\n", path); - // 01 : convert chrom loc to string with chr - int p = -1; - for(int i = 5; i < std::min( (int) 256, (int) strlen(path)) && p == -1; i++) { - printf(":: %c\n",path[i]); - if(path[i] == ':') { - p = i; - } - } - if(p == -1) { - p = std::min((int) 256, (int) strlen(path)); - } - std::string chr = std::string(path, 5, p - 5); - std::cout << "{" << chr << "}" << "\n"; + // parse "chr..:..-.." string + sequence_region sr = sequence_region( (strchr(path, '/') + 5) ); + std::cout << "[" << sr.seq_name << "]\n"; // 02 : check if 'chr' is equals this->data[i].name fastafs_seq *fsq = nullptr; size_t i; for(i = 0; i < ffi->f->data.size() && fsq == nullptr; i++ ) { - printf("[%s] == [%s] \n", chr.c_str() , ffi->f->data[i]->name.c_str()); - if( chr.compare(ffi->f->data[i]->name) == 0) { + if(sr.seq_name.compare(ffi->f->data[i]->name) == 0) { fsq = ffi->f->data[i]; } } @@ -247,17 +227,31 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st // 03 - if chr was found , ok, otherise, not ok if(fsq != nullptr) { // code below seems to work, but copying to buf doesn't seem to work? - std::ifstream file(ffi->f->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); if(file.is_open()) { - printf("alive?\n"); + size_t total_requested_size; + if(sr.has_defined_end) { + total_requested_size = sr.end + 1; + } + else { + total_requested_size = fsq->n; + } + printf("total requested length: %i\n", (int) total_requested_size); + + total_requested_size -= sr.start; + printf("total requested length: %i\n", (int) total_requested_size); + + total_requested_size = std::min(size, total_requested_size); + printf("total requested length: %i\n", (int) total_requested_size); + + printf("padding: %i\n", ffi->cache_p0->sequences[i]->padding); written = (signed int) fsq->view_fasta_chunk_cached( ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, buffer, // char *buffer - (size_t) size, // size_t buffer_size, - (off_t) 2 + fsq->name.size(), // off_t start_pos_in_fasta, + (size_t) total_requested_size, // size_t buffer_size, + (off_t) 2 + fsq->name.size() + sr.start, // off_t start_pos_in_fasta, &file // std::ifstream *fh) ); @@ -271,6 +265,8 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } file.close(); + } else { + // should return exit code of not 0 } } } else { diff --git a/src/SequenceRegion.cpp b/src/sequence_region.cpp similarity index 87% rename from src/SequenceRegion.cpp rename to src/sequence_region.cpp index 22d286eb..2f785ec8 100644 --- a/src/SequenceRegion.cpp +++ b/src/sequence_region.cpp @@ -1,9 +1,16 @@ -#include "SequenceRegion.hpp" +#include "sequence_region.hpp" -SequenceRegion::SequenceRegion(char * seqstr) : +sequence_region::sequence_region(char * seqstr) : + seq_name("") , has_defined_end(false), start(0), end(0) { + + parse((const char *) seqstr);// char* can be converted to cost char*, but not vice versa + +} + +sequence_region::sequence_region(const char * seqstr) : seq_name("") , has_defined_end(false), start(0), end(0) { parse(seqstr); @@ -11,7 +18,7 @@ SequenceRegion::SequenceRegion(char * seqstr) : } -void SequenceRegion::parse(char * seqstr) { +void sequence_region::parse(const char * seqstr) { // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); ssize_t p = -1; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 807041c1..6a17c0d7 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -28,7 +28,7 @@ add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_sequenceregion sequenceregion/test_sequenceregion.cpp ../src/SequenceRegion.cpp) +add_executable(test_sequenceregion sequenceregion/test_sequenceregion.cpp ../src/sequence_region.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) #add_executable(test_tree tree/test_tree.cpp) diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index b84f4955..cf2dacfd 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -4,7 +4,7 @@ #include "config.hpp" -#include "SequenceRegion.hpp" +#include "sequence_region.hpp" //#include @@ -30,7 +30,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { { char arg[] = "/seq/chr1"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end @@ -38,7 +38,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1:"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end @@ -46,7 +46,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1:123"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); BOOST_CHECK_EQUAL(sr.start , 123); @@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1:-123"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); BOOST_CHECK_EQUAL(sr.start , 0); @@ -68,7 +68,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1:123-456"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); BOOST_CHECK_EQUAL(sr.start , 123); @@ -80,7 +80,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1:123-"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); BOOST_CHECK_EQUAL(sr.start , 123); @@ -92,9 +92,9 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1:456-123"; - SequenceRegion *sr = nullptr; + sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new SequenceRegion(&(arg[5])) , std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])) , std::invalid_argument); } } @@ -106,7 +106,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { { char arg[] = "/seq/chrRr1"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end @@ -114,7 +114,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1:"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end @@ -122,7 +122,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1:1234"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 1234); @@ -133,7 +133,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1:-1234"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 0); @@ -144,7 +144,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1:1234-1235"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 1234); @@ -156,7 +156,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1:1234-"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 1234); @@ -168,9 +168,9 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1:1235-1234"; - SequenceRegion *sr = nullptr; + sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new SequenceRegion(&(arg[5])) , std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])) , std::invalid_argument); } } @@ -183,7 +183,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { { char arg[] = "/seq/chrRr1"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end @@ -191,7 +191,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1:"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end @@ -199,7 +199,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1:123"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 123); @@ -210,7 +210,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1:-123"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 0); @@ -221,7 +221,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1:123-456"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 123); @@ -233,7 +233,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1:123-"; - SequenceRegion sr = SequenceRegion(&(arg[5])); + sequence_region sr = sequence_region(&(arg[5])); BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); BOOST_CHECK_EQUAL(sr.start , 123); @@ -245,9 +245,9 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1:456-123"; - SequenceRegion *sr = nullptr; + sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new SequenceRegion(&(arg[5])) , std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])) , std::invalid_argument); } } From 7c0583e995626f6eefd8d9c74f9b4443efd3de49 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 09:28:46 +0100 Subject: [PATCH 090/119] tidy --- include/sequence_region.hpp | 29 ++--- src/fuse.cpp | 37 +++--- src/sequence_region.cpp | 41 +++--- test/sequenceregion/test_sequenceregion.cpp | 132 ++++++++++---------- 4 files changed, 118 insertions(+), 121 deletions(-) diff --git a/include/sequence_region.hpp b/include/sequence_region.hpp index 16f5b3a7..15dd6718 100644 --- a/include/sequence_region.hpp +++ b/include/sequence_region.hpp @@ -12,19 +12,20 @@ -class sequence_region { - public: - sequence_region(char *); - sequence_region(const char *); - - std::string seq_name; - - bool has_defined_end; - - off_t start; - off_t end; - - private: - void parse(const char *); +class sequence_region +{ +public: + sequence_region(char *); + sequence_region(const char *); + + std::string seq_name; + + bool has_defined_end; + + off_t start; + off_t end; + +private: + void parse(const char *); }; diff --git a/src/fuse.cpp b/src/fuse.cpp index 36ecdd14..0f2ffc28 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -69,7 +69,7 @@ static int do_getattr(const char *path, struct stat *st) st->st_atime = time(NULL); // The last "a"ccess of the file/directory is right now st->st_mtime = time(NULL); // The last "m"odification of the file/directory is right now - printf("[%s]\n" , path); + printf("[%s]\n", path); if(strcmp(path, "/") == 0) { //st->st_mode = S_IFREG | 0644; //st->st_nlink = 1; @@ -176,8 +176,9 @@ static int do_readdir(const char *path, void *buffer, fuse_fill_dir_t filler, of } } - filler(buffer, "seq", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 - filler(buffer, "seq/chr1:123", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 + if(strcmp(path, "/") == 0) { // If the user is trying to show the files/directories of the root directory show the following + filler(buffer, "seq", NULL, 0); // Directed indexed API access to subsequence "/seq/chr1:123-456 + } return 0; } @@ -212,13 +213,13 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); } else if(strncmp(path, "/seq/", 5) == 0) { // api access // parse "chr..:..-.." string - sequence_region sr = sequence_region( (strchr(path, '/') + 5) ); + sequence_region sr = sequence_region((strchr(path, '/') + 5)); std::cout << "[" << sr.seq_name << "]\n"; // 02 : check if 'chr' is equals this->data[i].name fastafs_seq *fsq = nullptr; size_t i; - for(i = 0; i < ffi->f->data.size() && fsq == nullptr; i++ ) { + for(i = 0; i < ffi->f->data.size() && fsq == nullptr; i++) { if(sr.seq_name.compare(ffi->f->data[i]->name) == 0) { fsq = ffi->f->data[i]; } @@ -232,37 +233,29 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st size_t total_requested_size; if(sr.has_defined_end) { total_requested_size = sr.end + 1; - } - else { + } else { total_requested_size = fsq->n; } printf("total requested length: %i\n", (int) total_requested_size); - + total_requested_size -= sr.start; printf("total requested length: %i\n", (int) total_requested_size); total_requested_size = std::min(size, total_requested_size); printf("total requested length: %i\n", (int) total_requested_size); - - printf("padding: %i\n", ffi->cache_p0->sequences[i]->padding); - written = (signed int) fsq->view_fasta_chunk_cached( - ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, - buffer, // char *buffer - (size_t) total_requested_size, // size_t buffer_size, - (off_t) 2 + fsq->name.size() + sr.start, // off_t start_pos_in_fasta, - &file // std::ifstream *fh) - ); - - printf("\nwritten: %i\n", (int) written); - + ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, + buffer, // char *buffer + (size_t) total_requested_size, // size_t buffer_size, + (off_t) 2 + fsq->name.size() + sr.start, // off_t start_pos_in_fasta, + &file // std::ifstream *fh) + ); + for(int kk = 0; kk < written ; kk++) { printf("%c", buffer[kk]); } - printf("\nwritten: %i\n", (int) written); - } file.close(); } else { diff --git a/src/sequence_region.cpp b/src/sequence_region.cpp index 2f785ec8..f87e05dc 100644 --- a/src/sequence_region.cpp +++ b/src/sequence_region.cpp @@ -4,21 +4,24 @@ sequence_region::sequence_region(char * seqstr) : - seq_name("") , has_defined_end(false), start(0), end(0) { + seq_name(""), has_defined_end(false), start(0), end(0) +{ parse((const char *) seqstr);// char* can be converted to cost char*, but not vice versa } sequence_region::sequence_region(const char * seqstr) : - seq_name("") , has_defined_end(false), start(0), end(0) { + seq_name(""), has_defined_end(false), start(0), end(0) +{ parse(seqstr); } -void sequence_region::parse(const char * seqstr) { +void sequence_region::parse(const char * seqstr) +{ // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); ssize_t p = -1; @@ -27,15 +30,15 @@ void sequence_region::parse(const char * seqstr) { p = (size_t) i; } } - + if(p > 0) { - this->seq_name = std::string(seqstr , 0 , p); + this->seq_name = std::string(seqstr, 0, p); } else if(p == -1) { - + // either with string > 255 chars or string smaller than 255 without ':' - this->seq_name = std::string(seqstr , 0 , string_max_pos); + this->seq_name = std::string(seqstr, 0, string_max_pos); } - + // chr1:1 // p = 4 // strlen = 6 @@ -51,36 +54,36 @@ void sequence_region::parse(const char * seqstr) { } } - + if(p2 == -1) { // chrA:123 - std::string start = std::string(seqstr,p + 1,p2 - p - 1); + std::string start = std::string(seqstr, p + 1, p2 - p - 1); - this->start = std::stoi( start ); + this->start = std::stoi(start); this->has_defined_end = true; this->end = this->start; } else if(p2 == (p + 1)) {// chrA:-123 - std::string end = std::string(seqstr,p2 + 1,strlen(seqstr) - p2 - 1); + std::string end = std::string(seqstr, p2 + 1, strlen(seqstr) - p2 - 1); this->start = 0; - this->end = std::stoi( end ) ; + this->end = std::stoi(end) ; this->has_defined_end = true; } else if(p2 > (p + 1)) { // chrA:123- | chrA:123-456 | chrA:123-456ERR if(p2 + 1 == strlen(seqstr)) { // chrA:123- - std::string start = std::string(seqstr,p + 1,p2 - p - 1); - + std::string start = std::string(seqstr, p + 1, p2 - p - 1); + this->start = std::stoi(start); this->has_defined_end = false; } else { // chrA:123-456 | chrA:123-456ERR - std::string start = std::string(seqstr,p + 1,p2 - p - 1); - std::string end = std::string(seqstr,p2 + 1,strlen(seqstr) - p2 - 1); + std::string start = std::string(seqstr, p + 1, p2 - p - 1); + std::string end = std::string(seqstr, p2 + 1, strlen(seqstr) - p2 - 1); - this->start = std::stoi( start ) ; + this->start = std::stoi(start) ; this->has_defined_end = true; - this->end = std::stoi( end ) ; + this->end = std::stoi(end) ; } } diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index cf2dacfd..8d2857db 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -31,50 +31,50 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) { char arg[] = "/seq/chr1"; sequence_region sr = sequence_region(&(arg[5])); - - BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end } { char arg[] = "/seq/chr1:"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end } { char arg[] = "/seq/chr1:123"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); - BOOST_CHECK_EQUAL(sr.start , 123); + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 123); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 123); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); } { char arg[] = "/seq/chr1:-123"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); - BOOST_CHECK_EQUAL(sr.start , 0); + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 0); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 123); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); } - + { char arg[] = "/seq/chr1:123-456"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); - BOOST_CHECK_EQUAL(sr.start , 123); + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 123); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 456); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 456); } @@ -82,10 +82,10 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) char arg[] = "/seq/chr1:123-"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chr1"); - BOOST_CHECK_EQUAL(sr.start , 123); + BOOST_CHECK_EQUAL(sr.seq_name, "chr1"); + BOOST_CHECK_EQUAL(sr.start, 123); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); //BOOST_CHECK_EQUAL(sr.end , 456); - underfined } @@ -94,7 +94,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region) sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])) , std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); } } @@ -107,50 +107,50 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { char arg[] = "/seq/chrRr1"; sequence_region sr = sequence_region(&(arg[5])); - - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end } { char arg[] = "/seq/chrRr1:"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end } { char arg[] = "/seq/chrRr1:1234"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 1234); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 1234); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 1234); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 1234); } { char arg[] = "/seq/chrRr1:-1234"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 0); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 0); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 1234); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 1234); } - + { char arg[] = "/seq/chrRr1:1234-1235"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 1234); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 1234); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 1235); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 1235); } @@ -158,10 +158,10 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) char arg[] = "/seq/chrRr1:1234-"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 1234); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 1234); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); //BOOST_CHECK_EQUAL(sr.end , 1235); - underfined } @@ -170,7 +170,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])) , std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); } } @@ -184,50 +184,50 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) { char arg[] = "/seq/chrRr1"; sequence_region sr = sequence_region(&(arg[5])); - - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end } { char arg[] = "/seq/chrRr1:"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); // not defined; sequence's end + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); // not defined; sequence's end } { char arg[] = "/seq/chrRr1:123"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 123); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 123); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 123); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); } { char arg[] = "/seq/chrRr1:-123"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 0); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 0); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 123); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 123); } - + { char arg[] = "/seq/chrRr1:123-456"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 123); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 123); - BOOST_CHECK_EQUAL(sr.has_defined_end , true); - BOOST_CHECK_EQUAL(sr.end , 456); + BOOST_CHECK_EQUAL(sr.has_defined_end, true); + BOOST_CHECK_EQUAL(sr.end, 456); } @@ -235,10 +235,10 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) char arg[] = "/seq/chrRr1:123-"; sequence_region sr = sequence_region(&(arg[5])); - BOOST_CHECK_EQUAL(sr.seq_name , "chrRr1"); - BOOST_CHECK_EQUAL(sr.start , 123); + BOOST_CHECK_EQUAL(sr.seq_name, "chrRr1"); + BOOST_CHECK_EQUAL(sr.start, 123); - BOOST_CHECK_EQUAL(sr.has_defined_end , false); + BOOST_CHECK_EQUAL(sr.has_defined_end, false); //BOOST_CHECK_EQUAL(sr.end , 456); - underfined } @@ -247,7 +247,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])) , std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); } } From 370455b06928f107e5d7fff34cc2b7b71f47af44 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 12:20:39 +0100 Subject: [PATCH 091/119] some refactoring --- include/fastafs.hpp | 9 +-- src/fastafs.cpp | 96 +++++++++++++++++++++----------- src/fuse.cpp | 4 +- src/ucsc2bit.cpp | 4 +- test/CMakeLists.txt | 14 ++--- test/cache/test_cache_twobit.cpp | 6 +- test/fastafs/test_fastafs.cpp | 47 +++++++++++++--- test/view/test_view.cpp | 60 ++++++++++---------- 8 files changed, 152 insertions(+), 88 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 97965bb0..0d5e9b01 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -74,8 +74,8 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); - uint32_t view_fasta_chunk_cached(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); - template uint32_t view_fasta_chunk_cached_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); + template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); std::string sha1(ffs2f_init_seq*, std::ifstream*);// sha1 works 'fine' but is, like md5, sensitive to length extension hacks and should actually not be used for identifiers. std::string md5(ffs2f_init_seq*, std::ifstream*);// md5 works 'fine' but is, like sha1, sensitive to length extension hacks and should actually not be used for identifiers. @@ -111,14 +111,15 @@ class fastafs fastafs_flags flags; - uint32_t n(); + uint32_t n();// number nucleotdies std::string basename(); void load(std::string); void view_fasta(ffs2f_init*); - uint32_t view_fasta_chunk_cached(ffs2f_init*, char*, size_t, off_t);//@todo remove _cached suffix + uint32_t view_sequence_region(ffs2f_init*, const char * , char*, size_t, off_t);// read stuff like "chr1:123-456" into the buffer + uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); uint32_t view_ucsc2bit_chunk(char *, size_t, off_t); size_t view_dict_chunk(char *, size_t, off_t); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index c7a379b7..ddb99e3f 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -33,7 +33,7 @@ #include "twobit_byte.hpp" #include "fourbit_byte.hpp" #include "fastafs.hpp" -//#include "flags.hpp" +#include "sequence_region.hpp" #include "utils.hpp" @@ -72,12 +72,12 @@ void fastafs_seq::view_fasta(ffs2f_init_seq* cache, std::ifstream *fh) uint32_t offset = 0; //@todo figure out if a do {} while() loop isn't more in place here? - uint32_t written = this->view_fasta_chunk_cached(cache, buffer, READ_BUFFER_SIZE, offset, fh); + uint32_t written = this->view_fasta_chunk(cache, buffer, READ_BUFFER_SIZE, offset, fh); while(written > 0) { std::cout << std::string(buffer, written); offset += written; - written = this->view_fasta_chunk_cached(cache, buffer, READ_BUFFER_SIZE, offset, fh); + written = this->view_fasta_chunk(cache, buffer, READ_BUFFER_SIZE, offset, fh); } } @@ -135,7 +135,7 @@ ffs2f_init_seq* fastafs_seq::init_ffs2f_seq(const uint32_t padding_arg, bool all // @todo templating like stuff -uint32_t fastafs_seq::view_fasta_chunk_cached( +uint32_t fastafs_seq::view_fasta_chunk( ffs2f_init_seq* cache, char *buffer, @@ -145,16 +145,16 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( std::ifstream *fh) { if(this->flags.is_twobit()) { - return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); + return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); } else { - return this->view_fasta_chunk_cached_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); + return this->view_fasta_chunk_generalized(cache, buffer, buffer_size, start_pos_in_fasta, fh); } } /* - * fastafs_seq::view_fasta_chunk_cached - + * fastafs_seq::view_fasta_chunk - * * @padding = number of spaces? * @char buffer = @@ -167,7 +167,7 @@ uint32_t fastafs_seq::view_fasta_chunk_cached( * * @todo see if this can be a std::ifstream or some kind of stream type of object? */ -template uint32_t fastafs_seq::view_fasta_chunk_cached_generalized( +template uint32_t fastafs_seq::view_fasta_chunk_generalized( ffs2f_init_seq* cache, char *buffer, @@ -369,7 +369,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { - this->view_fasta_chunk_cached(cache, chunk, + this->view_fasta_chunk(cache, chunk, chunksize, header_offset + (i * chunksize), fh); @@ -377,7 +377,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) } if(remaining_bytes > 0) { - this->view_fasta_chunk_cached(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); + this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); SHA1_Update(&ctx, chunk, remaining_bytes); //chunk[remaining_bytes] = '\0'; } @@ -421,7 +421,7 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { - this->view_fasta_chunk_cached(cache, chunk, + this->view_fasta_chunk(cache, chunk, chunksize, header_offset + (i * chunksize), fh); @@ -429,7 +429,7 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) } if(remaining_bytes > 0) { - this->view_fasta_chunk_cached(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); + this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); MD5_Update(&ctx, chunk, remaining_bytes); chunk[remaining_bytes] = '\0'; } @@ -470,15 +470,7 @@ uint32_t fastafs_seq::n_bits() } -/* -uint32_t fastafs_seq::n_bits() -{ - uint32_t n = this->n; - for(uint32_t i = 0; i < this->n_starts.size(); i++) { - n -= n_ends[i] - this->n_starts[i] + 1; - } - return (n + 3) / 4; -}*/ + //@brief calculates the number of paddings found in a sequence of length N with @@ -732,8 +724,48 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) return ddata; } + + + + +uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region_arg , char *buffer, size_t buffer_size, off_t file_offset) { +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs::view_sequence_region - error 01\n"); + } + + if(cache->padding_arg != 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 02\n"); + } + + if(cache->sequences.size() == 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 03\n"); + } +#endif + + + // parse "chr..:..-.." string + sequence_region sr = sequence_region(seq_region_arg); + std::cout << "[" << sr.seq_name << "]\n"; + + // 02 : check if 'chr' is equals this->data[i].name + //fastafs_seq *fsq = nullptr; + size_t i; + for(i = 0; i < this->data.size(); i++) { + if(sr.seq_name.compare(this->data[i]->name) == 0) { + return 4; //ffi->f->data[i]-> + } + } + + + return 0; +} + + + + /* - * fastafs::view_fasta_chunk_cached - + * fastafs::view_fasta_chunk - * * @cache: * @buffer: @@ -742,12 +774,7 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) * * returns */ -uint32_t fastafs::view_fasta_chunk_cached( - ffs2f_init* cache, - char *buffer, - - size_t buffer_size, - off_t file_offset) +uint32_t fastafs::view_fasta_chunk(ffs2f_init* cache, char *buffer, size_t buffer_size, off_t file_offset) { uint32_t written = 0; std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); @@ -761,7 +788,7 @@ uint32_t fastafs::view_fasta_chunk_cached( const uint32_t sequence_file_size = seq->fasta_filesize(cache->padding_arg); if(pos < sequence_file_size) { - const uint32_t written_seq = seq->view_fasta_chunk_cached( + const uint32_t written_seq = seq->view_fasta_chunk( cache->sequences[i], &buffer[written], std::min((uint32_t) buffer_size - written, sequence_file_size), @@ -783,7 +810,7 @@ uint32_t fastafs::view_fasta_chunk_cached( } file.close(); } else { - throw std::runtime_error("[fastafs::view_fasta_chunk_cached] could not load fastafs: " + this->filename); + throw std::runtime_error("[fastafs::view_fasta_chunk] could not load fastafs: " + this->filename); } return written; } @@ -1000,7 +1027,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi while(pos < pos_limit) { //printf("%i - %i = %i || %i\n",pos_limit,pos, (full_twobits - (pos_limit - pos)) * 4, j); //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file); - sequence->view_fasta_chunk_cached(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), &file); + sequence->view_fasta_chunk(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), &file); t.set(n_seq); buffer[written++] = t.data; pos++; @@ -1020,7 +1047,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi if(pos < pos_limit) { //printf("%i - %i = %i || %i :: %i == %i \n",pos_limit,pos, full_twobits * 4, j, sequence->n - (full_twobits * 4), sequence->n - j); //sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + full_twobits * 4, sequence->n - (full_twobits * 4), &file); - sequence->view_fasta_chunk_cached(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, &file); + sequence->view_fasta_chunk(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, &file); t.set(n_seq); buffer[written++] = t.data; pos++; @@ -1034,7 +1061,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi delete cache; file.close(); } else { - throw std::runtime_error("[fastafs::view_fasta_chunk_cached] could not load fastafs: " + this->filename); + throw std::runtime_error("[fastafs::view_fasta_chunk] could not load fastafs: " + this->filename); } return written; } @@ -1326,6 +1353,9 @@ uint32_t fastafs::view_faidx_chunk(uint32_t padding, char *buffer, size_t buffer return written; } + + + /* https://www.ebi.ac.uk/ena/cram/sha1/7716832754e642d068e6fbd8f792821ca5544309 Hello message sent diff --git a/src/fuse.cpp b/src/fuse.cpp index 0f2ffc28..b14157a2 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -204,7 +204,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st printf("?? [[%s]]\n", path); if(strcmp(path, virtual_fasta_filename.c_str()) == 0) { - written = (signed int) ffi->f->view_fasta_chunk_cached(ffi->cache, buffer, size, offset); + written = (signed int) ffi->f->view_fasta_chunk(ffi->cache, buffer, size, offset); } else if(strcmp(path, virtual_faidx_filename.c_str()) == 0) { written = (signed int) ffi->f->view_faidx_chunk(ffi->padding, buffer, size, offset); } else if(strcmp(path, virtual_ucsc2bit_filename.c_str()) == 0) { @@ -244,7 +244,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st total_requested_size = std::min(size, total_requested_size); printf("total requested length: %i\n", (int) total_requested_size); - written = (signed int) fsq->view_fasta_chunk_cached( + written = (signed int) fsq->view_fasta_chunk( ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, buffer, // char *buffer (size_t) total_requested_size, // size_t buffer_size, diff --git a/src/ucsc2bit.cpp b/src/ucsc2bit.cpp index ce26375b..e2d158c0 100644 --- a/src/ucsc2bit.cpp +++ b/src/ucsc2bit.cpp @@ -374,7 +374,7 @@ void ucsc2bit::load(std::string afilename) /* -* ucsc2bit::view_fasta_chunk_cached - +* ucsc2bit::view_fasta_chunk - * * @padding: size of padding - placement of newlines (default = 60) * @buffer: @@ -422,7 +422,7 @@ uint32_t ucsc2bit::view_fasta_chunk(uint32_t padding, char *buffer, size_t buffe file.close(); } else { - throw std::runtime_error("[ucsc2bit::view_fasta_chunk_cached] could not load ucsc2bit: " + this->filename); + throw std::runtime_error("[ucsc2bit::view_fasta_chunk] could not load ucsc2bit: " + this->filename); } return written; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6a17c0d7..d4d7736c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -20,14 +20,14 @@ set(BUILD_TEST_DIR "${BUILD_DIR}/test") add_executable(test_twobit_byte twobit_byte/test_twobit_byte.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) add_executable(test_fourbit_byte fourbit_byte/test_fourbit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_cache_twobit cache/test_cache_twobit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) -add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) -add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp) +add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) add_executable(test_sequenceregion sequenceregion/test_sequenceregion.cpp ../src/sequence_region.cpp) add_executable(test_utils utils/test_utils.cpp ../src/utils.cpp) #add_executable(test_tree tree/test_tree.cpp) diff --git a/test/cache/test_cache_twobit.cpp b/test/cache/test_cache_twobit.cpp index 8a9c8662..100b8c0e 100644 --- a/test/cache/test_cache_twobit.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -386,7 +386,7 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) std::string output = ""; while(written < f2.fasta_filesize(padding)) { - w = f2.view_fasta_chunk_cached(cache_p60_uc, buffer, write_size, written); + w = f2.view_fasta_chunk(cache_p60_uc, buffer, write_size, written); output.append(buffer, w); written += w; } @@ -401,7 +401,7 @@ BOOST_AUTO_TEST_CASE(test_cache_forwards_backwards) output = ""; while(written < f2.fasta_filesize(padding)) { - w = f2.view_fasta_chunk_cached(cache_p60_mc, buffer, write_size, written); + w = f2.view_fasta_chunk(cache_p60_mc, buffer, write_size, written); output.append(buffer, w); written += w; } @@ -444,7 +444,7 @@ BOOST_AUTO_TEST_CASE(test_cache_with_newlines) std::string output = ""; while(written < f2.fasta_filesize(padding)) { - w = f2.view_fasta_chunk_cached(cache_p60, buffer, write_size, written); + w = f2.view_fasta_chunk(cache_p60, buffer, write_size, written); output.append(buffer, w); written += w; } diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 5560b98d..e1edc3fa 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -61,11 +61,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) uint32_t ret; char chunk[4]; for(uint32_t i = 0; i < 23; i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p100->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p100->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(ret, 1); } for(uint32_t i = 23; i < 23 + 5; i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p100->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p100->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(ret, 0); } @@ -75,7 +75,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size) std::string ref = ">chr1\nttttccccaaaagggg\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p23->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p23->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } @@ -115,13 +115,13 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0) std::string ref = ">chr1\nttttccccaaaagggg\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } // check if out of bound query returns 0 - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, ref.size(), &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), &file); BOOST_CHECK_EQUAL(ret, 0); file.close(); @@ -156,13 +156,13 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_fastafile_size_padding_0__no_masking) std::string ref = ">chr1\nTTTTCCCCAAAAGGGG\n"; for(uint32_t i = 0; i < ref.size(); i++) { - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, i, &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, i, &file); BOOST_CHECK_EQUAL(chunk[0], ref[i]); // test for '>' BOOST_CHECK_EQUAL(ret, 1); } // check if out of bound query returns 0 - ret = fs.data[0]->view_fasta_chunk_cached(cache_p0->sequences[0], chunk, 1, ref.size(), &file); + ret = fs.data[0]->view_fasta_chunk(cache_p0->sequences[0], chunk, 1, ref.size(), &file); BOOST_CHECK_EQUAL(ret, 0); file.close(); @@ -380,6 +380,39 @@ BOOST_AUTO_TEST_CASE(test_fastafs__dict_virtualization) // } } +/** + * @description tests reading a request like "chr1:123-456" etc. + */ +BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) +{ + std::string fastafs_file = "tmp/test.fastafs"; + fasta_to_twobit_fastafs("test/data/test.fa", fastafs_file); + + fastafs fs = fastafs("test"); + fs.load(fastafs_file); + + BOOST_REQUIRE(fs.data.size() > 0); + + + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:0-4"; + + + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 4); +} + diff --git a/test/view/test_view.cpp b/test/view/test_view.cpp index bb5255fe..64237da2 100644 --- a/test/view/test_view.cpp +++ b/test/view/test_view.cpp @@ -202,7 +202,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) // padding: 4 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG @@ -211,7 +211,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 999 - longer than longest seq - written = fs.view_fasta_chunk_cached(cache_p999, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p999, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTTCCCCAAAAGGGG >chr2 ACTGACTGNNNNACTG >chr3.1 ACTGACTGAAAAC >chr3.2 ACTGACTGAAAACC >chr3.3 ACTGACTGAAAACCC >chr4 ACTGNNNN >chr5 NNACTG @@ -220,7 +220,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 5 - see if 2bit works - written = fs.view_fasta_chunk_cached(cache_p5, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p5, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTTC CCCAA AAGGG G >chr2 ACTGA CTGNN NNACT G >chr3.1 ACTGA CTGAA AAC >chr3.2 ACTGA CTGAA AACC >chr3.3 ACTGA CTGAA AACCC >chr4 ACTGN NNN >chr5 NNACT G @@ -229,7 +229,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 0); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 0); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -238,7 +238,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 1 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 1); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 1); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -247,7 +247,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 2 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 2); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 2); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -256,7 +256,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 3 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 3); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 3); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -265,7 +265,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 4 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 4); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 4); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -274,7 +274,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 1, offset 5 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 100, 5); + written = fs.view_fasta_chunk(cache_p1, buffer, 100, 5); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 T T T T C C C C A A A A G G G G >chr2 A C T G A C T G N N N N A C T G >chr3.1 A C T G A C T G A A A A C >chr3.2 A C T G A C T G A A A A C C >chr3.3 A C T G A C T G A A A A C C C >chr4 A C T G N N N N >chr5 N N A C T G @@ -283,7 +283,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 6 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 6); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 6); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -292,7 +292,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 7 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 7); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 7); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -301,7 +301,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 8 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 8); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 8); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -310,7 +310,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 9 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 9); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 9); BOOST_CHECK_EQUAL(written, 100); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG @@ -319,7 +319,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) flush_buffer(buffer, 100, '?'); // padding: 4, offset: 10 - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, 10); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, 10); std_buffer = std::string(buffer, 100); //>chr1 TTTT CCCC AAAA GGGG >chr2 ACTG ACTG NNNN ACTG >chr3.1 ACTG ACTG AAAA C >chr3.2 ACTG ACTG AAAA CC >chr3.3 ACTG ACTG AAAA CCC >chr4 ACTG NNNN >chr5 NNAC TG //XXXXXXXXXX----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----|----.----| @@ -332,7 +332,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing) for(uint32_t offset = 0; offset < 62; ++offset) { std::string substr_file = full_file.substr(offset, 100); - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 100, offset); + written = fs.view_fasta_chunk(cache_p4, buffer, 100, offset); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); @@ -376,7 +376,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_sub) //[>] [c] [h] [r] [3] [.] [1] [\n] [A] [C] [T] [G] [A] [C] [T] [G] [A] [A] [A] [A] [C] [\n] BOOST_CHECK_EQUAL(fs.data[2]->fasta_filesize(100), 22); - written = fs.data[2]->view_fasta_chunk_cached(cache_p100->sequences[2], buffer, 100, 0, &fh); + written = fs.data[2]->view_fasta_chunk(cache_p100->sequences[2], buffer, 100, 0, &fh); BOOST_CHECK_EQUAL(written, 22); std::string std_buffer = std::string(buffer, written); @@ -446,7 +446,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing2) for(uint32_t buffer_len = (uint32_t) full_file.size() - start_pos; buffer_len > 0; buffer_len--) { std::string substr_file = std::string(full_file, start_pos, buffer_len); - written = fs.view_fasta_chunk_cached(cache, buffer, buffer_len, start_pos); + written = fs.view_fasta_chunk(cache, buffer, buffer_len, start_pos); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << start_pos << " and of length: " << buffer_len); BOOST_CHECK_EQUAL_MESSAGE(std_buffer.compare(substr_file), 0, "Difference in content for offset=" << start_pos << " and of length: " << buffer_len); @@ -500,63 +500,63 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) // padding = 32, offset = 0 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 0); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 98); std_buffer = std::string(buffer, 98); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 1 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 1); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 1); BOOST_CHECK_EQUAL(written, 97); std_buffer = std::string(buffer, 97); BOOST_CHECK_EQUAL(std_buffer.compare("IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 2 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 2); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 2); BOOST_CHECK_EQUAL(written, 96); std_buffer = std::string(buffer, 96); BOOST_CHECK_EQUAL(std_buffer.compare("UPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 5 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 5); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 5); BOOST_CHECK_EQUAL(written, 93); std_buffer = std::string(buffer, 93); BOOST_CHECK_EQUAL(std_buffer.compare("C\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 6 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 6); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 6); BOOST_CHECK_EQUAL(written, 92); std_buffer = std::string(buffer, 92); BOOST_CHECK_EQUAL(std_buffer.compare("\nNBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 7 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 7); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 7); BOOST_CHECK_EQUAL(written, 91); std_buffer = std::string(buffer, 91); BOOST_CHECK_EQUAL(std_buffer.compare("NBKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 8 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 8); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 8); BOOST_CHECK_EQUAL(written, 90); std_buffer = std::string(buffer, 90); BOOST_CHECK_EQUAL(std_buffer.compare("BKAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 9 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 9); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 9); BOOST_CHECK_EQUAL(written, 89); std_buffer = std::string(buffer, 89); BOOST_CHECK_EQUAL(std_buffer.compare("KAHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 32, offset = 10 - written = fs.view_fasta_chunk_cached(cache_p32, buffer, 200, 10); + written = fs.view_fasta_chunk(cache_p32, buffer, 200, 10); BOOST_CHECK_EQUAL(written, 88); std_buffer = std::string(buffer, 88); BOOST_CHECK_EQUAL(std_buffer.compare("AHMDCUWGSYVTRHGWVUMTBSDN-----\n-----BGYADNHSMUTRCKWVsbhvdnrtgyc\nmkwuaAVTSDKNB---UGWMHYRC\n"), 0); @@ -564,21 +564,21 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) // padding = 1, offset = 0 - written = fs.view_fasta_chunk_cached(cache_p1, buffer, 200, 0); + written = fs.view_fasta_chunk(cache_p1, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 183); std_buffer = std::string(buffer, 183); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nN\nB\nK\nA\nH\nM\nD\nC\nU\nW\nG\nS\nY\nV\nT\nR\nH\nG\nW\nV\nU\nM\nT\nB\nS\nD\nN\n-\n-\n-\n-\n-\n-\n-\n-\n-\n-\nB\nG\nY\nA\nD\nN\nH\nS\nM\nU\nT\nR\nC\nK\nW\nV\ns\nb\nh\nv\nd\nn\nr\nt\ng\ny\nc\nm\nk\nw\nu\na\nA\nV\nT\nS\nD\nK\nN\nB\n-\n-\n-\nU\nG\nW\nM\nH\nY\nR\nC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 5, offset = 0 - written = fs.view_fasta_chunk_cached(cache_p5, buffer, 200, 0); + written = fs.view_fasta_chunk(cache_p5, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 113); std_buffer = std::string(buffer, 113); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAH\nMDCUW\nGSYVT\nRHGWV\nUMTBS\nDN---\n-----\n--BGY\nADNHS\nMUTRC\nKWVsb\nhvdnr\ntgycm\nkwuaA\nVTSDK\nNB---\nUGWMH\nYRC\n"), 0); flush_buffer(buffer, 200, '?'); // padding = 999, offset = 0 - written = fs.view_fasta_chunk_cached(cache_p999, buffer, 200, 0); + written = fs.view_fasta_chunk(cache_p999, buffer, 200, 0); BOOST_CHECK_EQUAL(written, 96); std_buffer = std::string(buffer, 96); BOOST_CHECK_EQUAL(std_buffer.compare(">IUPAC\nNBKAHMDCUWGSYVTRHGWVUMTBSDN----------BGYADNHSMUTRCKWVsbhvdnrtgycmkwuaAVTSDKNB---UGWMHYRC\n"), 0); @@ -590,7 +590,7 @@ BOOST_AUTO_TEST_CASE(test_chunked_viewing_fourbit) for(uint32_t offset = 0; offset < 62; ++offset) { std::string substr_file = full_file.substr(offset, 200); - written = fs.view_fasta_chunk_cached(cache_p4, buffer, 200, offset); + written = fs.view_fasta_chunk(cache_p4, buffer, 200, offset); std_buffer = std::string(buffer, substr_file.size()); BOOST_CHECK_EQUAL_MESSAGE(written, substr_file.size(), "Difference in size for size=" << substr_file.size() << " [found=" << written << "] for offset=" << offset); From eb413ee865f244e38f3718eb03af3080debbb91f Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 12:54:36 +0100 Subject: [PATCH 092/119] sav --- include/fastafs.hpp | 2 + include/sequence_region.hpp | 12 +++ src/fastafs.cpp | 70 +++++++++++++++--- test/fastafs/test_fastafs.cpp | 134 +++++++++++++++++++++++++++++++--- 4 files changed, 196 insertions(+), 22 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 0d5e9b01..61d1c2ae 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -12,6 +12,7 @@ #include "utils.hpp" +#include "sequence_region.hpp" #include "flags.hpp" @@ -74,6 +75,7 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); + uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region* , char *, size_t, off_t, std::ifstream *); uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); diff --git a/include/sequence_region.hpp b/include/sequence_region.hpp index 15dd6718..cb86ed87 100644 --- a/include/sequence_region.hpp +++ b/include/sequence_region.hpp @@ -1,3 +1,10 @@ + + + +#ifndef SEQUENCE_REGION_HPP +#define SEQUENCE_REGION_HPP + + #include #include #include @@ -29,3 +36,8 @@ class sequence_region void parse(const char *); }; + + +#endif + + diff --git a/src/fastafs.cpp b/src/fastafs.cpp index ddb99e3f..1d9073ba 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -33,7 +33,6 @@ #include "twobit_byte.hpp" #include "fourbit_byte.hpp" #include "fastafs.hpp" -#include "sequence_region.hpp" #include "utils.hpp" @@ -320,6 +319,55 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( + +uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_region* sr, char *buffer, size_t size, off_t offset , std::ifstream *fh) { +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 01\n"); + } + + if(sr == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 02\n"); + } + + if(size == 0) { // requestedsize must be larger than 0 + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 03\n"); + } + +#endif + + uint32_t written = 0; + + size_t total_requested_size; + if(sr->has_defined_end) { + total_requested_size = sr->end + 1; + printf("----------- AAAAA\ntotal_requested_size must be 0: %i\n",total_requested_size); + } else { + total_requested_size = this->n; + printf("----------- BBBBB\ntotal_requested_size must be 0: %i\n",total_requested_size); + } + + total_requested_size -= sr->start; + + printf("total_requested_size must be 0: %i\n",total_requested_size); + + total_requested_size = std::min(size, total_requested_size); + + printf("total_requested_size must be 0: %i\n",total_requested_size); + + written = (uint32_t) this->view_fasta_chunk( + cache, // ffs2f_init_seq* cache, + buffer, // char *buffer + (size_t) total_requested_size, // size_t buffer_size, + (off_t) 2 + this->name.size() + sr->start + offset, // offset is for chunked reading + fh + ); + + return written; +} + + + /* CRAM specification: @@ -743,21 +791,19 @@ uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region } #endif + std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if(file.is_open()) { + // parse "chr..:..-.." string + sequence_region sr = sequence_region(seq_region_arg); - // parse "chr..:..-.." string - sequence_region sr = sequence_region(seq_region_arg); - std::cout << "[" << sr.seq_name << "]\n"; - - // 02 : check if 'chr' is equals this->data[i].name - //fastafs_seq *fsq = nullptr; - size_t i; - for(i = 0; i < this->data.size(); i++) { - if(sr.seq_name.compare(this->data[i]->name) == 0) { - return 4; //ffi->f->data[i]-> + // 02 : check if 'chr' is equals this->data[i].name + for(size_t i = 0; i < this->data.size(); i++) { + if(sr.seq_name.compare(this->data[i]->name) == 0) { + return this->data[i]->view_sequence_region(cache->sequences[i], &sr, buffer, buffer_size, file_offset , &file); + } } } - return 0; } diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index e1edc3fa..da5f2bcf 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -393,24 +393,138 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) BOOST_REQUIRE(fs.data.size() > 0); + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:0"; - ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence - const char arg[] = "/seq/chr2:0-4"; - + size_t written; + char *buffer; + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - size_t written; - char *buffer; - // for buffer size ... { - buffer = new char[READ_BUFFER_SIZE + 1]; - flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 't'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:3"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 't'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:4"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'c'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:15"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'g'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr1:16"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + BOOST_CHECK_EQUAL(written, 0); + BOOST_CHECK_EQUAL(buffer[0], '\n'); + } + + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:0"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:1"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'C'); + } + - BOOST_CHECK_EQUAL(written, 4); } From 4ea9335550b1c3c6ec7f0ebfd1625292b0b8506b Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 14:16:59 +0100 Subject: [PATCH 093/119] seems to work --- src/fastafs.cpp | 9 +- test/fastafs/test_fastafs.cpp | 150 +++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 10 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 1d9073ba..a697e998 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -340,21 +340,14 @@ uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_regio size_t total_requested_size; if(sr->has_defined_end) { - total_requested_size = sr->end + 1; - printf("----------- AAAAA\ntotal_requested_size must be 0: %i\n",total_requested_size); + total_requested_size = std::min((size_t) this->n, (size_t) sr->end + 1); } else { total_requested_size = this->n; - printf("----------- BBBBB\ntotal_requested_size must be 0: %i\n",total_requested_size); } total_requested_size -= sr->start; - - printf("total_requested_size must be 0: %i\n",total_requested_size); - total_requested_size = std::min(size, total_requested_size); - printf("total_requested_size must be 0: %i\n",total_requested_size); - written = (uint32_t) this->view_fasta_chunk( cache, // ffs2f_init_seq* cache, buffer, // char *buffer diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index da5f2bcf..bf4142d5 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -480,11 +480,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 0); - BOOST_CHECK_EQUAL(buffer[0], '\n'); + //BOOST_CHECK_EQUAL(buffer[0], '\n'); } - { ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence const char arg[] = "/seq/chr2:0"; @@ -522,6 +521,153 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'C'); } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:7"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:8"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:9"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:10"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:11"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:12"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:15"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + + BOOST_CHECK_EQUAL(written, 1); + BOOST_CHECK_EQUAL(buffer[0], 'G'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:16"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + BOOST_CHECK_EQUAL(written, 0); + //BOOST_CHECK_EQUAL(buffer[0], 'G'); + } From 64ae40e3cdeeaedb09ba9485cad9a900bce01549 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 14:22:38 +0100 Subject: [PATCH 094/119] many more additional tests --- test/fastafs/test_fastafs.cpp | 64 +++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index bf4142d5..7c7492e8 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -669,6 +669,70 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) //BOOST_CHECK_EQUAL(buffer[0], 'G'); } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr4"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + BOOST_CHECK_EQUAL(written, 8); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + BOOST_CHECK_EQUAL(buffer[2], 'T'); + BOOST_CHECK_EQUAL(buffer[3], 'G'); + BOOST_CHECK_EQUAL(buffer[4], 'n'); + BOOST_CHECK_EQUAL(buffer[5], 'n'); + BOOST_CHECK_EQUAL(buffer[6], 'n'); + BOOST_CHECK_EQUAL(buffer[7], 'n'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr4:4-"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'n'); + BOOST_CHECK_EQUAL(buffer[1], 'n'); + BOOST_CHECK_EQUAL(buffer[2], 'n'); + BOOST_CHECK_EQUAL(buffer[3], 'n'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr3.1:1-2"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'C'); + BOOST_CHECK_EQUAL(buffer[1], 'T'); + } + } From 798afd03c468e73160ffc71967ab0d376fc0e307 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 14:29:31 +0100 Subject: [PATCH 095/119] another test passed --- src/fastafs.cpp | 1 + test/fastafs/test_fastafs.cpp | 41 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index a697e998..d79ed499 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -346,6 +346,7 @@ uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_regio } total_requested_size -= sr->start; + total_requested_size -= offset; total_requested_size = std::min(size, total_requested_size); written = (uint32_t) this->view_fasta_chunk( diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 7c7492e8..93b6030c 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -735,6 +735,47 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr3.3"; + + size_t written; + char *buffer; + + // for buffer size ... { + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 0); // small buffer size + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + BOOST_CHECK_EQUAL(buffer[2], 'T'); + BOOST_CHECK_EQUAL(buffer[3], 'G'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 4); // small buffer size + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + BOOST_CHECK_EQUAL(buffer[2], 'T'); + BOOST_CHECK_EQUAL(buffer[3], 'G'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 8); // small buffer size + BOOST_CHECK_EQUAL(written, 4); + BOOST_CHECK_EQUAL(buffer[0], 'a'); + BOOST_CHECK_EQUAL(buffer[1], 'a'); + BOOST_CHECK_EQUAL(buffer[2], 'a'); + BOOST_CHECK_EQUAL(buffer[3], 'a'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 12); // small buffer size + BOOST_CHECK_EQUAL(written, 3); + BOOST_CHECK_EQUAL(buffer[0], 'c'); + BOOST_CHECK_EQUAL(buffer[1], 'c'); + BOOST_CHECK_EQUAL(buffer[2], 'c'); + } + + + } From db2bc1d54a942599a3becda7286738a2d8b89199 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 14:42:50 +0100 Subject: [PATCH 096/119] sav --- test/fastafs/test_fastafs.cpp | 71 +++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 93b6030c..0530869e 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -400,7 +400,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -418,7 +417,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -436,7 +434,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -454,7 +451,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -472,7 +468,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -491,7 +486,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -510,7 +504,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -528,7 +521,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -547,7 +539,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -565,7 +556,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -583,7 +573,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -601,7 +590,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -620,7 +608,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -639,7 +626,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -659,7 +645,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -676,7 +661,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -701,7 +685,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -715,6 +698,26 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr4:-1"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'A'); + BOOST_CHECK_EQUAL(buffer[1], 'C'); + //BOOST_CHECK_EQUAL(buffer[2], 'T'); + //BOOST_CHECK_EQUAL(buffer[3], 'G'); + } + + { ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence const char arg[] = "/seq/chr3.1:1-2"; @@ -722,7 +725,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -742,7 +744,6 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) size_t written; char *buffer; - // for buffer size ... { buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); @@ -775,6 +776,38 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr5:2-5"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 2); // small buffer size + + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'T'); + BOOST_CHECK_EQUAL(buffer[1], 'G'); + } + + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chrDOESNOTEXIST"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); // small buffer size + + BOOST_CHECK_EQUAL(written, 0); + } } From 65cf5d7fa5c58472ce2e8abf2fdf2b07755add47 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 21 Jan 2020 14:54:35 +0100 Subject: [PATCH 097/119] sav --- include/fastafs.hpp | 4 ++-- src/fastafs.cpp | 20 ++++++++++-------- src/fuse.cpp | 50 +-------------------------------------------- 3 files changed, 14 insertions(+), 60 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 61d1c2ae..00a0f0db 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -75,7 +75,7 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); - uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region* , char *, size_t, off_t, std::ifstream *); + uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, std::ifstream *); uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); @@ -120,7 +120,7 @@ class fastafs void load(std::string); void view_fasta(ffs2f_init*); - uint32_t view_sequence_region(ffs2f_init*, const char * , char*, size_t, off_t);// read stuff like "chr1:123-456" into the buffer + uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); uint32_t view_ucsc2bit_chunk(char *, size_t, off_t); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index d79ed499..55e8c624 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -320,7 +320,8 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( -uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_region* sr, char *buffer, size_t size, off_t offset , std::ifstream *fh) { +uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_region* sr, char *buffer, size_t size, off_t offset, std::ifstream *fh) +{ #if DEBUG if(cache == nullptr) { throw std::invalid_argument("fastafs_seq::view_sequence_region - error 01\n"); @@ -412,9 +413,9 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { this->view_fasta_chunk(cache, chunk, - chunksize, - header_offset + (i * chunksize), - fh); + chunksize, + header_offset + (i * chunksize), + fh); SHA1_Update(&ctx, chunk, chunksize); } @@ -464,9 +465,9 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { this->view_fasta_chunk(cache, chunk, - chunksize, - header_offset + (i * chunksize), - fh); + chunksize, + header_offset + (i * chunksize), + fh); MD5_Update(&ctx, chunk, chunksize); } @@ -770,7 +771,8 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) -uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region_arg , char *buffer, size_t buffer_size, off_t file_offset) { +uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region_arg, char *buffer, size_t buffer_size, off_t file_offset) +{ #if DEBUG if(cache == nullptr) { throw std::invalid_argument("fastafs::view_sequence_region - error 01\n"); @@ -793,7 +795,7 @@ uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region // 02 : check if 'chr' is equals this->data[i].name for(size_t i = 0; i < this->data.size(); i++) { if(sr.seq_name.compare(this->data[i]->name) == 0) { - return this->data[i]->view_sequence_region(cache->sequences[i], &sr, buffer, buffer_size, file_offset , &file); + return this->data[i]->view_sequence_region(cache->sequences[i], &sr, buffer, buffer_size, file_offset, &file); } } } diff --git a/src/fuse.cpp b/src/fuse.cpp index b14157a2..8f209b14 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -212,55 +212,7 @@ static int do_read(const char *path, char *buffer, size_t size, off_t offset, st } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { written = (signed int) ffi->f->view_dict_chunk(buffer, size, offset); } else if(strncmp(path, "/seq/", 5) == 0) { // api access - // parse "chr..:..-.." string - sequence_region sr = sequence_region((strchr(path, '/') + 5)); - std::cout << "[" << sr.seq_name << "]\n"; - - // 02 : check if 'chr' is equals this->data[i].name - fastafs_seq *fsq = nullptr; - size_t i; - for(i = 0; i < ffi->f->data.size() && fsq == nullptr; i++) { - if(sr.seq_name.compare(ffi->f->data[i]->name) == 0) { - fsq = ffi->f->data[i]; - } - } - - // 03 - if chr was found , ok, otherise, not ok - if(fsq != nullptr) { - // code below seems to work, but copying to buf doesn't seem to work? - std::ifstream file(ffi->f->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); - if(file.is_open()) { - size_t total_requested_size; - if(sr.has_defined_end) { - total_requested_size = sr.end + 1; - } else { - total_requested_size = fsq->n; - } - printf("total requested length: %i\n", (int) total_requested_size); - - total_requested_size -= sr.start; - printf("total requested length: %i\n", (int) total_requested_size); - - total_requested_size = std::min(size, total_requested_size); - printf("total requested length: %i\n", (int) total_requested_size); - - written = (signed int) fsq->view_fasta_chunk( - ffi->cache_p0->sequences[i], // ffs2f_init_seq* cache, - buffer, // char *buffer - (size_t) total_requested_size, // size_t buffer_size, - (off_t) 2 + fsq->name.size() + sr.start, // off_t start_pos_in_fasta, - &file // std::ifstream *fh) - ); - - for(int kk = 0; kk < written ; kk++) { - printf("%c", buffer[kk]); - } - - } - file.close(); - } else { - // should return exit code of not 0 - } + written = (signed int) ffi->f->view_sequence_region(ffi->cache_p0, (strchr(path, '/') + 5), buffer, size, offset); } } else { if(ffi->u2b != nullptr) { From 18dee69d3bdad21b3715e7fd0f2b8b103164e844 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 22 Jan 2020 17:24:39 +0100 Subject: [PATCH 098/119] sav --- CMakeLists.txt | 1 + include/fastafs.hpp | 4 +-- src/fastafs.cpp | 47 ++++++++++++++++++++--------------- src/main.cpp | 4 +-- test/CMakeLists.txt | 3 +++ test/fastafs/test_fastafs.cpp | 2 +- 6 files changed, 36 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 357e221c..681e5fbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,7 @@ add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_fourbit") #add_test(test_view "${BUILD_TEST_DIR}/test_view") add_test(test_flags "${BUILD_TEST_DIR}/test_flags") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") +add_test(test_check "${BUILD_TEST_DIR}/test_check") # file integrity checks add_test(test_fastafs_as_ucsc2bit "${BUILD_TEST_DIR}/test_fastafs_as_ucsc2bit") add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs") add_test(test_ucsc2bit_as_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_as_fasta") diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 00a0f0db..d27b568f 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -135,8 +135,8 @@ class fastafs std::string get_faidx(uint32_t);//@todo get rid of this, make it full chunked int info(bool); - bool check_file_integrity(void); - bool check_sequence_integrity(void); + bool check_file_integrity(bool); + bool check_sequence_integrity(bool); }; diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 55e8c624..b73988b9 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1574,7 +1574,7 @@ uint32_t fastafs::get_crc32(void) //true = integer //false = corrupt -bool fastafs::check_file_integrity() +bool fastafs::check_file_integrity(bool verbose) { uint32_t crc32_current = this->get_crc32(); @@ -1586,23 +1586,26 @@ bool fastafs::check_file_integrity() char buf_new[5] = "\x00\x00\x00\x00"; uint_to_fourbytes(buf_new, (uint32_t) crc32_current); - printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (in-file) != %02hhx%02hhx%02hhx%02hhx (actual file)\n--\n", - (unsigned char) buf_old[0], - (unsigned char) buf_old[1], - (unsigned char) buf_old[2], - (unsigned char) buf_old[3], - - (unsigned char) buf_new[0], - (unsigned char) buf_new[1], - (unsigned char) buf_new[2], - (unsigned char) buf_new[3]); - + if(verbose) { + printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (in-file) != %02hhx%02hhx%02hhx%02hhx (actual file)\n--\n", + (unsigned char) buf_old[0], + (unsigned char) buf_old[1], + (unsigned char) buf_old[2], + (unsigned char) buf_old[3], + + (unsigned char) buf_new[0], + (unsigned char) buf_new[1], + (unsigned char) buf_new[2], + (unsigned char) buf_new[3]); + } } else { - printf("OK\t%02hhx%02hhx%02hhx%02hhx\n--\n", - (unsigned char) buf_old[0], - (unsigned char) buf_old[1], - (unsigned char) buf_old[2], - (unsigned char) buf_old[3]); + if(verbose) { + printf("OK\t%02hhx%02hhx%02hhx%02hhx\n--\n", + (unsigned char) buf_old[0], + (unsigned char) buf_old[1], + (unsigned char) buf_old[2], + (unsigned char) buf_old[3]); + } } return (crc32_current == this->crc32f); @@ -1611,7 +1614,7 @@ bool fastafs::check_file_integrity() //true = integer //false = corrupt -bool fastafs::check_sequence_integrity() +bool fastafs::check_sequence_integrity(bool verbose) { if(this->filename.size() == 0) { throw std::invalid_argument("No filename found"); @@ -1633,9 +1636,13 @@ bool fastafs::check_sequence_integrity() std::string new_hash = this->data[i]->md5(cache->sequences[i], &file); if(old_hash.compare(new_hash) == 0) { - printf("OK\t%s\n", this->data[i]->name.c_str()); + if(verbose) { + printf("OK\t%s\n", this->data[i]->name.c_str()); + } } else { - printf("ERROR\t%s\t%s != %s\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); + if(verbose) { + printf("ERROR\t%s\t%s != %s\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); + } retcode = false; } } diff --git a/src/main.cpp b/src/main.cpp index 02e75c35..d21186bf 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -253,8 +253,8 @@ int main(int argc, char *argv[]) fastafs f = fastafs(std::string(argv[argc - 1])); f.load(fname); - bool check1 = f.check_file_integrity(); - bool check2 = f.check_sequence_integrity(); + bool check1 = f.check_file_integrity(true); + bool check2 = f.check_sequence_integrity(true); if(check1 and check2) { return 0; } else { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index d4d7736c..d3604c30 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -25,6 +25,7 @@ add_executable(test_cache_fourbit cache/test_cache_fourbit.cpp ../src/fast add_executable(test_view view/test_view.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/fasta_to_fourbit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) add_executable(test_flags flags/test_flags.cpp ../src/flags.cpp ../src/utils.cpp) add_executable(test_fastafs fastafs/test_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) +add_executable(test_check check/test_check.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) add_executable(test_fastafs_as_ucsc2bit fastafs/test_ucsc2bit.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) add_executable(test_ucsc2bit_to_fastafs ucsc2bit_to_fastafs/test_ucsc2bit_to_fastafs.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/ucsc2bit_to_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) add_executable(test_ucsc2bit_as_fasta ucsc2bit/test_ucsc2bit_as_fasta.cpp ../src/fasta_to_twobit_fastafs.cpp ../src/flags.cpp ../src/fastafs.cpp ../src/ucsc2bit.cpp ../src/twobit_byte.cpp ../src/fourbit_byte.cpp ../src/utils.cpp ../src/sequence_region.cpp) @@ -45,6 +46,8 @@ set_target_properties(test_flags PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") +set_target_properties(test_check + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_fastafs_as_ucsc2bit PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_TEST_DIR}") set_target_properties(test_twobit_byte diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 0530869e..dbe20f54 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -235,7 +235,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1b) std::ifstream file(fs.filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); BOOST_REQUIRE(file.is_open()); - BOOST_CHECK_EQUAL(fs.check_sequence_integrity(), true); + BOOST_CHECK_EQUAL(fs.check_sequence_integrity(false), true); } From 64462a9acad04034605233a32de3a46e4a0cf8ae Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 22 Jan 2020 17:25:05 +0100 Subject: [PATCH 099/119] sav --- test/check/test_check.cpp | 75 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 test/check/test_check.cpp diff --git a/test/check/test_check.cpp b/test/check/test_check.cpp new file mode 100644 index 00000000..0437bd83 --- /dev/null +++ b/test/check/test_check.cpp @@ -0,0 +1,75 @@ +#define BOOST_TEST_MODULE fastfs_check + +#include + +#include "config.hpp" + +#include "fasta_to_twobit_fastafs.hpp" +#include "fasta_to_fourbit_fastafs.hpp" + + + +BOOST_AUTO_TEST_SUITE(Testing) + +/** + * @brief + * + * @test + */ +BOOST_AUTO_TEST_CASE(test_file_integrity) +{ + fasta_to_twobit_fastafs("test/data/test.fa", "tmp/test_cache_test.fastafs"); + + // check computed file size + fastafs f = fastafs(""); + f.load("tmp/test_cache_test.fastafs"); + BOOST_REQUIRE_EQUAL(f.fastafs_filesize(), 403); + + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), true); + BOOST_CHECK_EQUAL(f.check_file_integrity(false), true); + + for(int i = 5; i < 403 - 5 - 1 - 1 ; i ++) { + char buffer[400]; + + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + std::ifstream fh_fastafs_in("tmp/test_cache_test.fastafs", std::ios::out | std::ios::binary); + std::ofstream fh_fastafs_out(tmp_file, std::ios::out | std::ios::binary); + + fh_fastafs_in.read(buffer, i); + fh_fastafs_out.write(reinterpret_cast(&buffer), i); + + // modify the i-th base to something else + fh_fastafs_in.read(buffer, 1); + if(buffer[0] == '\x01') { + buffer[0] = '\x02'; + } else { + buffer[0] = '\x01'; + } + fh_fastafs_out.write(reinterpret_cast(&buffer), 1); + + fh_fastafs_in.read(buffer, 403 - i - 1); + fh_fastafs_out.write(reinterpret_cast(&buffer), 403 - i - 1); + + fh_fastafs_in.close(); + fh_fastafs_out.close(); + + + fastafs f = fastafs(""); + f.filename = tmp_file; // don't load + BOOST_CHECK_EQUAL(f.check_file_integrity(false), false); + + // make a new loop and only test those that are affected, as the checksums can then be loaded appropriately + if(f.check_sequence_integrity(true)) { + printf("[%i][!true]\n", i); + } else { + printf("[%i][!false]\n", i); + } + + } +} + + + + + +BOOST_AUTO_TEST_SUITE_END() From 922c69a4b8018558447b85f09137c416192efce4 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 23 Jan 2020 10:56:13 +0100 Subject: [PATCH 100/119] sav --- .gitignore | 1 + src/fastafs.cpp | 3 ++ test/cache/test_cache_twobit.cpp | 4 +- test/check/test_check.cpp | 77 +++++++++++++++++++++++++++++--- 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 75594e1d..580d522e 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ xcheck.sh /build-meson *.ninja .ninja* +test-mount.sh diff --git a/src/fastafs.cpp b/src/fastafs.cpp index b73988b9..9a7eb43d 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1643,10 +1643,13 @@ bool fastafs::check_sequence_integrity(bool verbose) if(verbose) { printf("ERROR\t%s\t%s != %s\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); } + retcode = false; } } file.close(); + } else { + throw std::runtime_error("[fastafs::check_sequence_integrity] could not load fastafs: " + this->filename); } delete cache; diff --git a/test/cache/test_cache_twobit.cpp b/test/cache/test_cache_twobit.cpp index 100b8c0e..1a493b5e 100644 --- a/test/cache/test_cache_twobit.cpp +++ b/test/cache/test_cache_twobit.cpp @@ -237,8 +237,8 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00\x00\x00\x10"s// [14, 17] seq length (16) (of 2bit encoded bytes; n-blocks are excluded) "\x00\x55\xAA\xFF"s// [18, 21] sequence "\x00\x00\x00\x00"s// [22, 25] n-blocks (0) - "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 45] checksum - "\x00\x00\x00\x01"s// [46, 49] m-blocks (1) + "\x75\x25\x5C\x6D\x90\x77\x89\x99\xAD\x36\x43\xA2\xE6\x9D\x43\x44"s// [26, 41] checksum + "\x00\x00\x00\x01"s// [42, ] m-blocks (1) "\x00\x00\x00\x00"s// [50, 53] m-block starts (0) "\x00\x00\x00\x0F"s// [54, 57] m-block starts (15) "\x00\x00\x00\x0C"s// [58, 61] seq length (12) (of 2bit encoded bytes; n-blocks are excluded) diff --git a/test/check/test_check.cpp b/test/check/test_check.cpp index 0437bd83..16eb33a3 100644 --- a/test/check/test_check.cpp +++ b/test/check/test_check.cpp @@ -28,7 +28,10 @@ BOOST_AUTO_TEST_CASE(test_file_integrity) BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), true); BOOST_CHECK_EQUAL(f.check_file_integrity(false), true); - for(int i = 5; i < 403 - 5 - 1 - 1 ; i ++) { + static const int i_min = 5; + static const int i_max = 403 - 5 - 1 - 1; + + for(int i = i_min; i < i_max ; i ++) { char buffer[400]; std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; @@ -57,15 +60,75 @@ BOOST_AUTO_TEST_CASE(test_file_integrity) fastafs f = fastafs(""); f.filename = tmp_file; // don't load BOOST_CHECK_EQUAL(f.check_file_integrity(false), false); + } + + + for(int i = 18; i <= 21 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 58; i <= 60 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 113; i <= 116 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 157; i <= 160 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; - // make a new loop and only test those that are affected, as the checksums can then be loaded appropriately - if(f.check_sequence_integrity(true)) { - printf("[%i][!true]\n", i); - } else { - printf("[%i][!false]\n", i); - } + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + for(int i = 201; i <= 204 ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + { + int i = 245; + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); } + + { + int i = 294; + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + + fastafs f = fastafs(""); + f.load(tmp_file); + BOOST_CHECK_EQUAL(f.check_sequence_integrity(false), false); + } + + + + for(int i = i_min; i < i_max ; i ++) { + std::string tmp_file = "tmp/test_cache_test_" + std::to_string(i) + ".fastafs"; + remove(tmp_file.c_str()); + } + } From cc70d4a44b69a0f0c19f13afe63b3e2a92b0e8a3 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Thu, 23 Jan 2020 11:18:02 +0100 Subject: [PATCH 101/119] info gives compression type --- src/fastafs.cpp | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 9a7eb43d..f81600f3 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1449,11 +1449,26 @@ int fastafs::info(bool ena_verify_checksum) std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); if(file.is_open()) { - std::cout << "FASTAFS NAME: " << this->filename << "\n"; - printf("SEQUENCES: %u\n", (uint32_t) this->data.size()); + std::cout << "# FASTAFS NAME: " << this->filename << "\n"; + printf("# SEQUENCES: %u\n", (uint32_t) this->data.size()); for(uint32_t i = 0; i < this->data.size(); i++) { md5_digest_to_hash(this->data[i]->md5_digest, md5_hash); + + std::string compression_type; + if(this->data[i]->flags.is_twobit()) { + compression_type = "2bit"; + } else if(this->data[i]->flags.is_twobit()) { + compression_type = "4bit"; + } else { + compression_type = "????"; + } + + + // print sequence name & size & checksum + printf("%-24s%-12i%s %s", this->data[i]->name.c_str(), this->data[i]->n, compression_type.c_str(), md5_hash); + + if(ena_verify_checksum) { //wget header of: //https://www.ebi.ac.uk/ena/cram/md5/ @@ -1464,7 +1479,6 @@ int fastafs::info(bool ena_verify_checksum) //struct sockadfiledr_in address; int sock = 0; struct sockaddr_in serv_addr; - //std::string hello2 = "GET /ena/cram/md5/" + std::string(md5_hash) + " HTTP/1.1\r\nHost: www.ebi.ac.uk\r\nConnection: Keep-Alive\r\n\r\n"; std::string hello2 = "GET /ena/cram/md5/" + std::string(md5_hash) + " HTTP/1.1\r\nHost: www.ebi.ac.uk\r\nConnection: Keep-Alive\r\n\r\n"; //char *hello = &hello2.c_str(); @@ -1516,15 +1530,15 @@ int fastafs::info(bool ena_verify_checksum) int NNvalread = SSL_read(ssl, buffer, 32); if(NNvalread < 0) { - printf(" >%-24s%-12i%s \n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash); + printf(" "); } else if(std::string(buffer).find(" 200 ") != (size_t) -1) { // sequence is in ENA - printf(" >%-24s%-12i%s https://www.ebi.ac.uk/ena/cram/md5/%s\n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash, md5_hash); + printf(" https://www.ebi.ac.uk/ena/cram/md5/%s", md5_hash); } else { - printf(" >%-24s%-12i%s ---\n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash); + printf(" ---"); } - } else { - printf(" >%-24s%-12i%s\n", this->data[i]->name.c_str(), this->data[i]->n, md5_hash); } + + printf("\n"); } file.close(); } From b0e1391e07e4f6624cd496ab58edde6e84a65b0a Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 27 Jan 2020 16:59:10 +0100 Subject: [PATCH 102/119] implements "fastafs cache -o ... ..." --- include/utils.hpp | 3 ++- src/fastafs.cpp | 2 +- src/main.cpp | 30 ++++++++++++++++++++++++++---- src/utils.cpp | 33 ++++++++++++++++++++++++++++++++- 4 files changed, 61 insertions(+), 7 deletions(-) diff --git a/include/utils.hpp b/include/utils.hpp index da6efd2f..1158a084 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -16,7 +16,8 @@ void md5_digest_to_hash(unsigned char *, char *); std::string std_string_nullbyte_safe(char *, size_t, size_t); std::string std_string_nullbyte_safe(char *, size_t); -bool is_fasta_file(char *filename); +bool is_fasta_file(char *); +bool is_ucsc2bit_file(char *); std::string basename_cpp(std::string); std::string realpath_cpp(std::string); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index f81600f3..65de5373 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1537,7 +1537,7 @@ int fastafs::info(bool ena_verify_checksum) printf(" ---"); } } - + printf("\n"); } file.close(); diff --git a/src/main.cpp b/src/main.cpp index d21186bf..167d333e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -67,7 +67,9 @@ void usage_check(void) void usage_cache(void) { - std::cout << "usage: " << PACKAGE << " cache \n\n"; + std::cout << "usage: " << PACKAGE << " cache \n"; + std::cout << " cache -o \n\n"; + std::cout << " -o, --output-file Explicitly define fastafs output file and do not write to database (cache)" << std::endl; std::cout << "\n"; } @@ -92,13 +94,33 @@ int main(int argc, char *argv[]) exit(0); } else if(strcmp(argv[1], "cache") == 0) { if(argc > 3) { - database d = database(); - std::string fname_out = d.add(argv[argc - 2]); + bool to_cache = true; + if(argc > 4 && strlen(argv[argc - 3]) >= 2) { + if( + (strcmp(argv[argc - 3], "-o") == 0) + or + (strcmp(argv[argc - 3], "--output-file") == 0) + + ) { + to_cache = false; + } + } + + std::string fname_out; + if(to_cache) { + database d = database(); + std::string fname_out = d.add(argv[argc - 2]); + } else { + fname_out = std::string(argv[argc - 2]); + } if(is_fasta_file(argv[argc - 1])) { fasta_to_twobit_fastafs(argv[argc - 1], fname_out); - } else { + } else if(is_ucsc2bit_file(argv[argc - 1])) { ucsc2bit_to_fastafs(argv[argc - 1], fname_out); + } else { + throw std::runtime_error("[main::cache] Invalid file format"); + return 1; } } else { usage_cache(); diff --git a/src/utils.cpp b/src/utils.cpp index 2d073b81..bc5f892a 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -156,7 +156,7 @@ bool is_fasta_file(char *filename) FILE *fp; if((fp = fopen(filename, "rb")) == NULL) { - fclose(fp); + //fclose(fp); segfault if NULL throw std::runtime_error("Could not read first byte of putative FASTA file."); return false; } @@ -166,6 +166,37 @@ bool is_fasta_file(char *filename) return (buf[0] == '>');// return true if first byte equals > } else { fclose(fp); + + throw std::runtime_error("Could not read sufficient data."); + } + + return false; +} + + + +bool is_ucsc2bit_file(char *filename) +{ + char buf[2]; + FILE *fp; + + if((fp = fopen(filename, "rb")) == NULL) { + //fclose(fp); segfault if NULL + throw std::runtime_error("Could not read first byte of putative FASTA file."); + return false; + } + + if(fread(buf, 1, 4, fp) == 4) { + fclose(fp); + return ( + buf[0] == UCSC2BIT_MAGIC[0] and + buf[1] == UCSC2BIT_MAGIC[1] and + buf[2] == UCSC2BIT_MAGIC[2] and + buf[3] == UCSC2BIT_MAGIC[3] + );// return true if first byte equals > + } else { + fclose(fp); + throw std::runtime_error("Could not read sufficient data."); } From 9dc91f8898e46d1007b5f60f2865fdce0a128154 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 27 Jan 2020 20:06:46 +0100 Subject: [PATCH 103/119] implements determining size of virtual /seq/... files --- include/fastafs.hpp | 2 ++ src/fastafs.cpp | 75 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index d27b568f..625d5f03 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -75,6 +75,7 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); + uint32_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, off_t, std::ifstream *); uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, std::ifstream *); uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); @@ -120,6 +121,7 @@ class fastafs void load(std::string); void view_fasta(ffs2f_init*); + uint32_t view_sequence_region_size(ffs2f_init*, const char *, off_t); // read stuff like "chr1:123-456" into the buffer uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 65de5373..3aab5392 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -320,6 +320,47 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( +uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_region* sr, off_t offset, std::ifstream *fh) +{ +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 01\n"); + } + + if(sr == nullptr) { + throw std::invalid_argument("fastafs_seq::view_sequence_region - error 02\n"); + } + +#endif + + uint32_t written = 0; + + size_t total_requested_size; + if(sr->has_defined_end) { + total_requested_size = std::min((size_t) this->n, (size_t) sr->end + 1); + } else { + total_requested_size = this->n; + } + + total_requested_size -= sr->start; + total_requested_size -= offset; + //total_requested_size = std::min(size, total_requested_size); + +/* + written = (uint32_t) this->view_fasta_chunk( + cache, // ffs2f_init_seq* cache, + buffer, // char *buffer + (size_t) total_requested_size, // size_t buffer_size, + (off_t) 2 + this->name.size() + sr->start + offset, // offset is for chunked reading + fh + ); + + return written; + */ + + return total_requested_size; +} + uint32_t fastafs_seq::view_sequence_region(ffs2f_init_seq* cache, sequence_region* sr, char *buffer, size_t size, off_t offset, std::ifstream *fh) { #if DEBUG @@ -771,6 +812,40 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) +uint32_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg, off_t file_offset) +{ +#if DEBUG + if(cache == nullptr) { + throw std::invalid_argument("fastafs::view_sequence_region - error 01\n"); + } + + if(cache->padding_arg != 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 02\n"); + } + + if(cache->sequences.size() == 0) { + throw std::invalid_argument("fastafs::view_sequence_region - error 03\n"); + } +#endif + + std::ifstream file(this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate); + if(file.is_open()) { + // parse "chr..:..-.." string + sequence_region sr = sequence_region(seq_region_arg); + + // 02 : check if 'chr' is equals this->data[i].name + for(size_t i = 0; i < this->data.size(); i++) { + if(sr.seq_name.compare(this->data[i]->name) == 0) { + return this->data[i]->view_sequence_region_size(cache->sequences[i], &sr, file_offset, &file); + } + } + } + + return 0; +} + + + uint32_t fastafs::view_sequence_region(ffs2f_init* cache, const char *seq_region_arg, char *buffer, size_t buffer_size, off_t file_offset) { #if DEBUG From 787517fb1e0f7c1735ef4b3d6585478c6e8b185c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 27 Jan 2020 20:23:43 +0100 Subject: [PATCH 104/119] tests --- test/fastafs/test_fastafs.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index dbe20f54..7ff6f15b 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -403,10 +403,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); + BOOST_CHECK_EQUAL(written, 1); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 't'); } @@ -664,8 +665,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); + BOOST_CHECK_EQUAL(written, 8); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 8); BOOST_CHECK_EQUAL(buffer[0], 'A'); BOOST_CHECK_EQUAL(buffer[1], 'C'); @@ -688,6 +691,8 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 4); @@ -804,8 +809,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); // small buffer size + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); + BOOST_CHECK_EQUAL(written, 0); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); // small buffer size BOOST_CHECK_EQUAL(written, 0); } From fadbe6fec981dc222166cb1aaf025bdfc3d6ce32 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 27 Jan 2020 20:56:42 +0100 Subject: [PATCH 105/119] adds big number of additional unit tests for file sizes --- include/fastafs.hpp | 4 +- src/fastafs.cpp | 31 ++++---- test/fastafs/test_fastafs.cpp | 137 +++++++++++++++++++++++++--------- 3 files changed, 118 insertions(+), 54 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 625d5f03..2012fa64 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -75,7 +75,7 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); - uint32_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, off_t, std::ifstream *); + uint32_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, std::ifstream *); uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, std::ifstream *); uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); @@ -121,7 +121,7 @@ class fastafs void load(std::string); void view_fasta(ffs2f_init*); - uint32_t view_sequence_region_size(ffs2f_init*, const char *, off_t); // read stuff like "chr1:123-456" into the buffer + uint32_t view_sequence_region_size(ffs2f_init*, const char *); // read stuff like "chr1:123-456" into the buffer uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 3aab5392..c7242133 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -320,7 +320,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( -uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_region* sr, off_t offset, std::ifstream *fh) +uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_region* sr, std::ifstream *fh) { #if DEBUG if(cache == nullptr) { @@ -333,7 +333,6 @@ uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_ #endif - uint32_t written = 0; size_t total_requested_size; if(sr->has_defined_end) { @@ -343,20 +342,20 @@ uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_ } total_requested_size -= sr->start; - total_requested_size -= offset; + //total_requested_size -= offset; //total_requested_size = std::min(size, total_requested_size); -/* - written = (uint32_t) this->view_fasta_chunk( - cache, // ffs2f_init_seq* cache, - buffer, // char *buffer - (size_t) total_requested_size, // size_t buffer_size, - (off_t) 2 + this->name.size() + sr->start + offset, // offset is for chunked reading - fh - ); + /* + written = (uint32_t) this->view_fasta_chunk( + cache, // ffs2f_init_seq* cache, + buffer, // char *buffer + (size_t) total_requested_size, // size_t buffer_size, + (off_t) 2 + this->name.size() + sr->start + offset, // offset is for chunked reading + fh + ); - return written; - */ + return written; + */ return total_requested_size; } @@ -811,8 +810,8 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) - -uint32_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg, off_t file_offset) +// estimates the whole file size of a file such as "/seq/chr1:56-" +uint32_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg) { #if DEBUG if(cache == nullptr) { @@ -836,7 +835,7 @@ uint32_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_r // 02 : check if 'chr' is equals this->data[i].name for(size_t i = 0; i < this->data.size(); i++) { if(sr.seq_name.compare(this->data[i]->name) == 0) { - return this->data[i]->view_sequence_region_size(cache->sequences[i], &sr, file_offset, &file); + return this->data[i]->view_sequence_region_size(cache->sequences[i], &sr, &file); } } } diff --git a/test/fastafs/test_fastafs.cpp b/test/fastafs/test_fastafs.cpp index 7ff6f15b..ca5e12fc 100644 --- a/test/fastafs/test_fastafs.cpp +++ b/test/fastafs/test_fastafs.cpp @@ -403,10 +403,9 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); BOOST_CHECK_EQUAL(written, 1); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 't'); @@ -421,10 +420,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 't'); } @@ -438,10 +437,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'c'); } @@ -455,10 +454,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'g'); } @@ -472,9 +471,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 0); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - BOOST_CHECK_EQUAL(written, 0); //BOOST_CHECK_EQUAL(buffer[0], '\n'); } @@ -490,10 +490,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'A'); } @@ -508,10 +508,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'C'); } @@ -525,10 +525,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'G'); } @@ -543,10 +543,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'n'); } @@ -560,10 +560,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'n'); } @@ -577,10 +577,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'n'); } @@ -594,10 +594,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'n'); } @@ -612,10 +612,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'A'); } @@ -630,14 +631,51 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 1); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - - BOOST_CHECK_EQUAL(written, 1); BOOST_CHECK_EQUAL(buffer[0], 'G'); } + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:14-15"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'T'); + BOOST_CHECK_EQUAL(buffer[1], 'G'); + } + + { + ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence + const char arg[] = "/seq/chr2:14-99999"; + + size_t written; + char *buffer; + + buffer = new char[READ_BUFFER_SIZE + 1]; + flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + BOOST_CHECK_EQUAL(written, 2); + BOOST_CHECK_EQUAL(buffer[0], 'T'); + BOOST_CHECK_EQUAL(buffer[1], 'G'); + } { ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence @@ -649,8 +687,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 0); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 0); //BOOST_CHECK_EQUAL(buffer[0], 'G'); } @@ -665,7 +705,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); BOOST_CHECK_EQUAL(written, 8); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); @@ -691,10 +731,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 4); - written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); - BOOST_CHECK_EQUAL(written, 4); BOOST_CHECK_EQUAL(buffer[0], 'n'); BOOST_CHECK_EQUAL(buffer[1], 'n'); @@ -705,7 +745,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) { ffs2f_init* cache_p0 = fs.init_ffs2f(0, true); // @ padding 0 as it reflects actual plain sequence - const char arg[] = "/seq/chr4:-1"; + const char arg[] = "/seq/chr4:-1";// from left to 1: <0,1] size_t written; char *buffer; @@ -713,8 +753,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 2); BOOST_CHECK_EQUAL(buffer[0], 'A'); BOOST_CHECK_EQUAL(buffer[1], 'C'); @@ -733,8 +775,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 2); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); BOOST_CHECK_EQUAL(written, 2); BOOST_CHECK_EQUAL(buffer[0], 'C'); BOOST_CHECK_EQUAL(buffer[1], 'T'); @@ -752,6 +797,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 0); // small buffer size BOOST_CHECK_EQUAL(written, 4); BOOST_CHECK_EQUAL(buffer[0], 'A'); @@ -759,6 +808,9 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) BOOST_CHECK_EQUAL(buffer[2], 'T'); BOOST_CHECK_EQUAL(buffer[3], 'G'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 4); // small buffer size BOOST_CHECK_EQUAL(written, 4); BOOST_CHECK_EQUAL(buffer[0], 'A'); @@ -766,6 +818,10 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) BOOST_CHECK_EQUAL(buffer[2], 'T'); BOOST_CHECK_EQUAL(buffer[3], 'G'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 8); // small buffer size BOOST_CHECK_EQUAL(written, 4); BOOST_CHECK_EQUAL(buffer[0], 'a'); @@ -773,11 +829,17 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) BOOST_CHECK_EQUAL(buffer[2], 'a'); BOOST_CHECK_EQUAL(buffer[3], 'a'); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, 4, 12); // small buffer size BOOST_CHECK_EQUAL(written, 3); BOOST_CHECK_EQUAL(buffer[0], 'c'); BOOST_CHECK_EQUAL(buffer[1], 'c'); BOOST_CHECK_EQUAL(buffer[2], 'c'); + + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 15); } @@ -791,8 +853,11 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) buffer = new char[READ_BUFFER_SIZE + 1]; flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 2); // small buffer size + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); + BOOST_CHECK_EQUAL(written, 4); + + written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 2); // small buffer size BOOST_CHECK_EQUAL(written, 2); BOOST_CHECK_EQUAL(buffer[0], 'T'); BOOST_CHECK_EQUAL(buffer[1], 'G'); @@ -810,7 +875,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs__sequence_virtualization) flush_buffer(buffer, READ_BUFFER_SIZE, '\0'); - written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5), 0); + written = fs.view_sequence_region_size(cache_p0, (strchr(arg, '/') + 5)); BOOST_CHECK_EQUAL(written, 0); written = fs.view_sequence_region(cache_p0, (strchr(arg, '/') + 5), buffer, READ_BUFFER_SIZE, 0); // small buffer size From 5d090ced4094e9be9ad6f5e2bcacfa3f41603bef Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 27 Jan 2020 21:18:54 +0100 Subject: [PATCH 106/119] implements virtual file size estimation of api type file access --- include/fastafs.hpp | 4 ++-- src/fastafs.cpp | 19 ++----------------- src/fuse.cpp | 9 +++++---- src/main.cpp | 2 +- 4 files changed, 10 insertions(+), 24 deletions(-) diff --git a/include/fastafs.hpp b/include/fastafs.hpp index 2012fa64..18922096 100644 --- a/include/fastafs.hpp +++ b/include/fastafs.hpp @@ -75,7 +75,7 @@ class fastafs_seq uint32_t fasta_filesize(uint32_t padding); void view_fasta(ffs2f_init_seq*, std::ifstream *); - uint32_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, std::ifstream *); + size_t view_sequence_region_size(ffs2f_init_seq*, sequence_region*, std::ifstream *); uint32_t view_sequence_region(ffs2f_init_seq*, sequence_region*, char *, size_t, off_t, std::ifstream *); uint32_t view_fasta_chunk(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); template uint32_t view_fasta_chunk_generalized(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *); @@ -121,7 +121,7 @@ class fastafs void load(std::string); void view_fasta(ffs2f_init*); - uint32_t view_sequence_region_size(ffs2f_init*, const char *); // read stuff like "chr1:123-456" into the buffer + size_t view_sequence_region_size(ffs2f_init*, const char *); // read stuff like "chr1:123-456" into the buffer uint32_t view_sequence_region(ffs2f_init*, const char *, char*, size_t, off_t); // read stuff like "chr1:123-456" into the buffer uint32_t view_fasta_chunk(ffs2f_init*, char*, size_t, off_t); uint32_t view_faidx_chunk(uint32_t, char *, size_t, off_t); diff --git a/src/fastafs.cpp b/src/fastafs.cpp index c7242133..29276164 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -320,7 +320,7 @@ template uint32_t fastafs_seq::view_fasta_chunk_generalized( -uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_region* sr, std::ifstream *fh) +size_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_region* sr, std::ifstream *fh) { #if DEBUG if(cache == nullptr) { @@ -342,21 +342,6 @@ uint32_t fastafs_seq::view_sequence_region_size(ffs2f_init_seq* cache, sequence_ } total_requested_size -= sr->start; - //total_requested_size -= offset; - //total_requested_size = std::min(size, total_requested_size); - - /* - written = (uint32_t) this->view_fasta_chunk( - cache, // ffs2f_init_seq* cache, - buffer, // char *buffer - (size_t) total_requested_size, // size_t buffer_size, - (off_t) 2 + this->name.size() + sr->start + offset, // offset is for chunked reading - fh - ); - - return written; - */ - return total_requested_size; } @@ -811,7 +796,7 @@ ffs2f_init* fastafs::init_ffs2f(uint32_t padding, bool allow_masking) // estimates the whole file size of a file such as "/seq/chr1:56-" -uint32_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg) +size_t fastafs::view_sequence_region_size(ffs2f_init* cache, const char *seq_region_arg) { #if DEBUG if(cache == nullptr) { diff --git a/src/fuse.cpp b/src/fuse.cpp index 8f209b14..2208782d 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -79,7 +79,7 @@ static int do_getattr(const char *path, struct stat *st) st->st_nlink = 2; // Why "two" hardlinks instead of "one"? The answer is here: http://unix.stackexchange.com/a/101536 } else if(strlen(path) == 4 && strncmp(path, "/seq", 4) == 0) { //directory - printf("setting to DIR because /seq\n"); + //printf("setting to DIR because /seq\n"); st->st_mode = S_IFDIR | 0755; st->st_nlink = 1; } else if(strlen(path) > 4 && strncmp(path, "/seq/", 5) == 0) { @@ -90,7 +90,7 @@ static int do_getattr(const char *path, struct stat *st) st->st_nlink = 1; //@todo this needs to be defined with some api stuff:!! - st->st_size = 4096; + st->st_size = (signed int) ffi->f->view_sequence_region_size(ffi->cache_p0, (strchr(path, '/') + 5)); } else { st->st_mode = S_IFREG | 0644; st->st_nlink = 1; @@ -112,9 +112,10 @@ static int do_getattr(const char *path, struct stat *st) st->st_size = ffi->f->ucsc2bit_filesize(); } else if(strcmp(path, virtual_dict_filename.c_str()) == 0) { st->st_size = ffi->f->dict_filesize(); - } else if(strncmp(path, "/seq/", 5) == 0) { // api access - printf("filesize: set to 4096\n"); } + //else if(strncmp(path, "/seq/", 5) == 0) { // api access + //printf("filesize: set to 4096\n"); + //} } } else { if(ffi->u2b != nullptr) { diff --git a/src/main.cpp b/src/main.cpp index 167d333e..0ea94c55 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -109,7 +109,7 @@ int main(int argc, char *argv[]) std::string fname_out; if(to_cache) { database d = database(); - std::string fname_out = d.add(argv[argc - 2]); + fname_out = d.add(argv[argc - 2]); } else { fname_out = std::string(argv[argc - 2]); } From 81a0a2dcfa7d25e14d91f37cdc470d3e293a6612 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Mon, 27 Jan 2020 21:23:09 +0100 Subject: [PATCH 107/119] update changelog with some of the new features --- Changelog | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Changelog b/Changelog index eb55a392..2b8276e7 100644 --- a/Changelog +++ b/Changelog @@ -1,6 +1,8 @@ -2012-01-15 Youri Hoogstrate +2020-01-27 Youri Hoogstrate * v1.7.0 + * `fastafs cache -o` for custom output files and bypassing the config + * Random access retrievement via file system: `/seq/chr1:100-200` * CRC32 checksums for file integratity * converting to meson because of insane build times using cmake+make and re-building files that have not changed From d4d84b4ce6ccdd6cc4678f73a556a04a7df324ec Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 28 Jan 2020 16:22:51 +0100 Subject: [PATCH 108/119] implements auto conversion to 4bit per-whole-file --- CMakeLists.txt | 1 + src/fasta_to_twobit_fastafs.cpp | 3 +-- src/fastafs.cpp | 2 +- src/main.cpp | 11 ++++++++--- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 681e5fbd..cfdf3678 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -103,6 +103,7 @@ endif() add_executable(fastafs src/main.cpp src/fasta_to_twobit_fastafs.cpp + src/fasta_to_fourbit_fastafs.cpp src/ucsc2bit_to_fastafs.cpp src/flags.cpp src/fastafs.cpp diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index b6cf5868..27b10a27 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -267,8 +267,7 @@ size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string f MD5_Update(&s->ctx, nn, 1); break; default: - std::cerr << "invalid chars in FASTA file" << std::endl; - exit(1); + throw std::runtime_error("[fasta_to_twobit_fastafs] invalid chars in FASTA file"); break; } } diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 29276164..bd745e44 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -1517,7 +1517,7 @@ int fastafs::info(bool ena_verify_checksum) std::string compression_type; if(this->data[i]->flags.is_twobit()) { compression_type = "2bit"; - } else if(this->data[i]->flags.is_twobit()) { + } else if(this->data[i]->flags.is_fourbit()) { compression_type = "4bit"; } else { compression_type = "????"; diff --git a/src/main.cpp b/src/main.cpp index 0ea94c55..9883aff7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,6 +7,7 @@ #include "config.hpp" #include "fasta_to_twobit_fastafs.hpp" +#include "fasta_to_fourbit_fastafs.hpp" #include "ucsc2bit_to_fastafs.hpp" #include "database.hpp" #include "fuse.hpp" @@ -69,8 +70,7 @@ void usage_cache(void) { std::cout << "usage: " << PACKAGE << " cache \n"; std::cout << " cache -o \n\n"; - std::cout << " -o, --output-file Explicitly define fastafs output file and do not write to database (cache)" << std::endl; - std::cout << "\n"; + std::cout << " -o, --output-file Explicitly define fastafs output file and do not write to database (cache)\n"; } int main(int argc, char *argv[]) @@ -115,7 +115,12 @@ int main(int argc, char *argv[]) } if(is_fasta_file(argv[argc - 1])) { - fasta_to_twobit_fastafs(argv[argc - 1], fname_out); + try { + fasta_to_twobit_fastafs(argv[argc - 1], fname_out); + } + catch(std::runtime_error) { + fasta_to_fourbit_fastafs(argv[argc - 1], fname_out); + } } else if(is_ucsc2bit_file(argv[argc - 1])) { ucsc2bit_to_fastafs(argv[argc - 1], fname_out); } else { From c13cd47f847682d6192769aad24fcbb6317f01c8 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 28 Jan 2020 17:01:45 +0100 Subject: [PATCH 109/119] 4bit checksums ok --- include/utils.hpp | 1 + src/fasta_to_fourbit_fastafs.cpp | 8 ++++---- src/fastafs.cpp | 24 ++++++++++++++++++------ src/utils.cpp | 12 ++++++++++++ 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/include/utils.hpp b/include/utils.hpp index 1158a084..8cc89094 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -6,6 +6,7 @@ uint32_t fourbytes_to_uint_ucsc2bit(char *, unsigned char); uint16_t twobytes_to_uint(char *); void uint_to_twobytes(char *chars, uint16_t n); +size_t remove_chars(char *s, int c, size_t l);// to remove - characters from string void uint_to_fourbytes(char *, uint32_t); void uint_to_fourbytes_ucsc2bit(char *, uint32_t); diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index 275f8baf..dc33ab89 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -177,6 +177,7 @@ size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string fh_fastafs << "\x00\x00\x00\x00"s;// number of 2bit encoded nucleotides, not yet known index.push_back(s); } else { + std::cout << "{"; for(std::string::iterator it = line.begin(); it != line.end(); ++it) { switch(*it) { @@ -206,7 +207,7 @@ size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string } s->add_ACTG(1, fh_fastafs); - MD5_Update(&s->ctx, na, 1); + MD5_Update(&s->ctx, nc, 1); break; case 'c': if(!s->in_m_block) { @@ -216,7 +217,7 @@ size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string } s->add_ACTG(1, fh_fastafs); - MD5_Update(&s->ctx, na, 1); + MD5_Update(&s->ctx, nc, 1); break; case 'G': if(s->in_m_block) { @@ -513,8 +514,7 @@ size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string break; default: - std::cerr << "invalid chars in FASTA file" << std::endl; - exit(1); + throw std::runtime_error("[fasta_to_fourbit_fastafs] invalid chars in FASTA file"); break; } } diff --git a/src/fastafs.cpp b/src/fastafs.cpp index bd745e44..8848f638 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -462,6 +462,7 @@ std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh) } + std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) { #if DEBUG @@ -487,18 +488,29 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) unsigned long n_iterations = (unsigned long) this->n / chunksize; signed int remaining_bytes = this->n % chunksize; + size_t written; // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { - this->view_fasta_chunk(cache, chunk, + written = this->view_fasta_chunk(cache, chunk, chunksize, header_offset + (i * chunksize), fh); - MD5_Update(&ctx, chunk, chunksize); + + if(this->flags.is_fourbit()) { + written = remove_chars(chunk, '-', written); + } + + MD5_Update(&ctx, chunk, written); } if(remaining_bytes > 0) { - this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); - MD5_Update(&ctx, chunk, remaining_bytes); + written = this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); + + if(this->flags.is_fourbit()) { + written = remove_chars(chunk, '-', written); + } + + MD5_Update(&ctx, chunk, written); chunk[remaining_bytes] = '\0'; } @@ -1660,7 +1672,7 @@ bool fastafs::check_file_integrity(bool verbose) uint_to_fourbytes(buf_new, (uint32_t) crc32_current); if(verbose) { - printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (in-file) != %02hhx%02hhx%02hhx%02hhx (actual file)\n--\n", + printf("ERROR\t%02hhx%02hhx%02hhx%02hhx (encoded in fastafs) != %02hhx%02hhx%02hhx%02hhx (on disk)\n--\n", (unsigned char) buf_old[0], (unsigned char) buf_old[1], (unsigned char) buf_old[2], @@ -1714,7 +1726,7 @@ bool fastafs::check_sequence_integrity(bool verbose) } } else { if(verbose) { - printf("ERROR\t%s\t%s != %s\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); + printf("ERROR\t%s\t%s (encoded in fastafs) != %s (on disk)\n", this->data[i]->name.c_str(), md5_hash, new_hash.c_str()); } retcode = false; diff --git a/src/utils.cpp b/src/utils.cpp index bc5f892a..cae392cc 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -63,6 +63,18 @@ void uint_to_twobytes(char *chars, uint16_t n) +size_t remove_chars(char *s, int c, size_t l){ + size_t j = 0; + size_t n = l; + + for (size_t i=j=0; i < n; i++) + if (s[i] != c) + s[j++] = s[i]; + + s[j] = '\0'; + + return j; +} void uint_to_fourbytes(char *chars, uint32_t n) From a39eddbf810d7a828d33d6dbe8c913bbffd58948 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Tue, 28 Jan 2020 17:12:45 +0100 Subject: [PATCH 110/119] updated tests --- test/cache/test_cache_fourbit.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 2e908f74..42c0c4b1 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -162,7 +162,7 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00\x00\x00\x4D"s// [, ] n-block[1] starts (77) "\x00\x00\x00\x24"s// [, ] n-block[0] ends (36|37) "\x00\x00\x00\x4F"s// [, ] n-block[1] ends (79) - "\xEE\x09\x2F\x63\x4F\x6C\x87\xD0\x6B\x57\x1F\x07\xD1\x42\x73\x00"s// [76, ] checksum + "\x4A\x4D\x43\xFF\x09\x08\x29\xCD\x05\x9A\x08\x3C\x48\x3F\xEB\x3C"s// [76, ] checksum "\x00\x00\x00\x01"s// [92, ] m-blocks (1) "\x00\x00\x00\x35"s// [96, ] m-block starts (53) "\x00\x00\x00\x44"s// [100, ] m-block starts (68) @@ -177,7 +177,7 @@ BOOST_AUTO_TEST_CASE(test_cache) "\x00"s // [120] no metadata fields [padding will come soon?] // CRC32 - "\x41\x2f\x3c\x72"s + "\x3d\xbf\x6e\xbf"s ; BOOST_CHECK_EQUAL(written, 125); @@ -202,10 +202,10 @@ BOOST_AUTO_TEST_CASE(test_cache) for(unsigned int i = 0; i < size; i++) { BOOST_CHECK_EQUAL(buffer[i], reference[i]); - //if(reference[i] != buffer[i]) { - // printf("comparing char %u\n", i); - // printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); - //} + if(reference[i] != buffer[i]) { + printf("comparing char %u\n", i); + printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); + } } From 7e0c07aec6df6a2e189e0bb29183fe7695023b10 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 14:45:53 +0100 Subject: [PATCH 111/119] guix build script --- CMakeLists.txt | 6 ++++++ guix.scm | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 guix.scm diff --git a/CMakeLists.txt b/CMakeLists.txt index cfdf3678..f51d74e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,10 @@ else() endif() configure_file("include/config.hpp.in" "include/config.hpp") +configure_file("include/config.hpp.in" "${CMAKE_CURRENT_BINARY_DIR}/config.hpp") +configure_file("include/config.hpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/config.hpp") +configure_file("include/config.hpp.in" "${BUILD_DIR}/config.hpp") + # ---------------------------------------------------------------------- # ------------------------------ Styling ------------------------------- @@ -75,6 +79,8 @@ add_custom_target(tidy DEPENDS make_tidy ) add_subdirectory(src) include_directories(include) +include_directories(${BUILD_DIR}) +include_directories(${BUILD_DIR}"/include") add_definitions(-std=c++14) # Boost diff --git a/guix.scm b/guix.scm new file mode 100644 index 00000000..0123e7f0 --- /dev/null +++ b/guix.scm @@ -0,0 +1,38 @@ +;; guix package --install-from-file=/home/youri/src/fastafs/guix.scm +;; https://guix.gnu.org/blog/2018/a-packaging-tutorial-for-guix/ + +(use-modules (guix packages) + (guix download) + (guix git-download) + (guix build-system gnu) + (guix build-system cmake) + (guix licenses) + (gnu packages boost) + (gnu packages compression) + (gnu packages tls) + (gnu packages linux)) + +(package + (name "fastafs") + (version "2.10") + (source (origin + (method url-fetch) + ; (uri (string-append "https://github.com/yhoogstrate/fastafs/archive/a39eddbf810d7a828d33d6dbe8c913bbffd58948.tar.gz")) + (uri (string-append "file:///home/youri/.local/src/fastafs.tar.gz")) + (sha256 + (base32 + "1njzvaxy1nq4202ispphyxddihq1x1cmfzbl8zmkqiwa028k540c")))) + (build-system cmake-build-system) + (arguments + `(#:build-type "debug")) + (inputs + `(("boost" ,boost) + ("zlib" ,zlib) + ("openssl" ,openssl) + ("fuse" ,fuse) + )) + (synopsis "fastafs") + (description + "fastafs.") + (home-page "https://,..,") + (license gpl3+)) From 83d787c343e9e5988f88e75cc4e0a8889251e9b5 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 15:09:55 +0100 Subject: [PATCH 112/119] guix --- guix.scm | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/guix.scm b/guix.scm index 0123e7f0..2e9c87c2 100644 --- a/guix.scm +++ b/guix.scm @@ -14,7 +14,7 @@ (package (name "fastafs") - (version "2.10") + (version "1.7.0") (source (origin (method url-fetch) ; (uri (string-append "https://github.com/yhoogstrate/fastafs/archive/a39eddbf810d7a828d33d6dbe8c913bbffd58948.tar.gz")) @@ -24,7 +24,9 @@ "1njzvaxy1nq4202ispphyxddihq1x1cmfzbl8zmkqiwa028k540c")))) (build-system cmake-build-system) (arguments - `(#:build-type "debug")) + `(#:build-type "debug" + #:tests? #f) ; skip tests that fail because test data is not in build path + ) (inputs `(("boost" ,boost) ("zlib" ,zlib) @@ -33,6 +35,6 @@ )) (synopsis "fastafs") (description - "fastafs.") - (home-page "https://,..,") - (license gpl3+)) + "fastafs: toolkit for file system virtualisation of random access compressed FASTA, FAI, DICT & TWOBIT files") + (home-page "https://github.com/yhoogstrate/fastafs") + (license gpl2+)) From 162ff414f4906c24dc8d61e1a5de603084e765e8 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 20:01:20 +0100 Subject: [PATCH 113/119] fixed ssize_t/size_t compiler warning --- src/sequence_region.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/sequence_region.cpp b/src/sequence_region.cpp index f87e05dc..6ec5b34c 100644 --- a/src/sequence_region.cpp +++ b/src/sequence_region.cpp @@ -24,17 +24,18 @@ void sequence_region::parse(const char * seqstr) { // the + 1 is the also allow parsing "sequence-of-size-255-...-:123-345" size_t string_max_pos = std::min(MAX_SIZE_SEQ_NAME + 1, strlen(seqstr)); - ssize_t p = -1; - for(size_t i = 0; i < string_max_pos && p == -1; i++) { + size_t p = 0; + bool proceed = true; + for(size_t i = 0; i < string_max_pos && proceed == true; i++) { if(seqstr[i] == ':') { - p = (size_t) i; + p = i; + proceed = false; } } if(p > 0) { this->seq_name = std::string(seqstr, 0, p); - } else if(p == -1) { - + } else if(p == false) { // either with string > 255 chars or string smaller than 255 without ':' this->seq_name = std::string(seqstr, 0, string_max_pos); } @@ -42,20 +43,22 @@ void sequence_region::parse(const char * seqstr) // chr1:1 // p = 4 // strlen = 6 - if(p != -1 and strlen(seqstr) > (p + 1)) { + if(proceed == false and strlen(seqstr) > (p + 1)) { // we can parse numbers // find position of '-' character - ssize_t p2 = -1; + size_t p2 = 0; + bool proceed2 = true; - for(size_t i = p; i < strlen(seqstr) && p2 == -1; i++) { + for(size_t i = p; i < strlen(seqstr) && proceed2 == true; i++) { if(seqstr[i] == '-') { p2 = (size_t) i; + proceed2 = false; } } - if(p2 == -1) { // chrA:123 + if(proceed2 == true) { // chrA:123 std::string start = std::string(seqstr, p + 1, p2 - p - 1); this->start = std::stoi(start); From ac38f4028d387fb53dcbf6cd3f9bcede8b47bd3c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 20:22:59 +0100 Subject: [PATCH 114/119] updates cmake file --- CMakeLists.txt | 12 ++++++------ src/main.cpp | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f51d74e7..d48b09c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,10 +41,10 @@ else() set(DEBUG "false") endif() -configure_file("include/config.hpp.in" "include/config.hpp") -configure_file("include/config.hpp.in" "${CMAKE_CURRENT_BINARY_DIR}/config.hpp") -configure_file("include/config.hpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/config.hpp") -configure_file("include/config.hpp.in" "${BUILD_DIR}/config.hpp") +configure_file("include/config.hpp.in" "include/config.hpp")# implies building is done from project root +#configure_file("include/config.hpp.in" "${CMAKE_CURRENT_BINARY_DIR}/config.hpp") +#configure_file("include/config.hpp.in" "${CMAKE_CURRENT_SOURCE_DIR}/config.hpp") +configure_file("include/config.hpp.in" "${BUILD_DIR}/include/config.hpp") # ---------------------------------------------------------------------- @@ -79,8 +79,8 @@ add_custom_target(tidy DEPENDS make_tidy ) add_subdirectory(src) include_directories(include) -include_directories(${BUILD_DIR}) -include_directories(${BUILD_DIR}"/include") +#include_directories(${BUILD_DIR}) +include_directories("${BUILD_DIR}/include") add_definitions(-std=c++14) # Boost diff --git a/src/main.cpp b/src/main.cpp index 9883aff7..af179ce0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -40,6 +40,7 @@ void usage_view(void) std::cout << "View FASTAFS file in FASTA format" << std::endl << std::endl; std::cout << " -f, --file Provide fastafs by file path, not from database (cache)" << std::endl; std::cout << " -p, --padding Number of nucleotides before delimited with a newline [default=60]" << std::endl; + std::cout << " -m, --no-masking Disable masking; bases in lower-case (not for 2bit output)" << std::endl; std::cout << " -2, --2bit View in UCSC twoBit/2bit format" << std::endl; std::cout << " http://genome.ucsc.edu/FAQ/FAQformat.html#format7" << std::endl; std::cout << std::endl; @@ -118,7 +119,7 @@ int main(int argc, char *argv[]) try { fasta_to_twobit_fastafs(argv[argc - 1], fname_out); } - catch(std::runtime_error) { + catch(std::runtime_error& e) { fasta_to_fourbit_fastafs(argv[argc - 1], fname_out); } } else if(is_ucsc2bit_file(argv[argc - 1])) { From 9a744644b0640c3da8632795d8e98fce238fb6fd Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 20:45:07 +0100 Subject: [PATCH 115/119] update changelog and implements -m/--no-masking argument --- Changelog | 1 + src/fastafs.cpp | 12 ++++++------ src/fuse.cpp | 28 +++++++++++++++++++++++++--- src/main.cpp | 8 +++++--- src/utils.cpp | 22 ++++++++++++---------- 5 files changed, 49 insertions(+), 22 deletions(-) diff --git a/Changelog b/Changelog index 2b8276e7..b284a90a 100644 --- a/Changelog +++ b/Changelog @@ -5,6 +5,7 @@ * Random access retrievement via file system: `/seq/chr1:100-200` * CRC32 checksums for file integratity * converting to meson because of insane build times using cmake+make and re-building files that have not changed + * `fastafs view|mount -m/--no-masking` virtualises fasta files without masking (uppercase) 2019-09-06 Youri Hoogstrate diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 8848f638..1f5549d2 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -492,10 +492,10 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) // half iteration remainder = this->n % chunk_size; if this number > 0; do it too for(uint32_t i = 0; i < n_iterations; i++) { written = this->view_fasta_chunk(cache, chunk, - chunksize, - header_offset + (i * chunksize), - fh); - + chunksize, + header_offset + (i * chunksize), + fh); + if(this->flags.is_fourbit()) { written = remove_chars(chunk, '-', written); } @@ -505,11 +505,11 @@ std::string fastafs_seq::md5(ffs2f_init_seq* cache, std::ifstream *fh) if(remaining_bytes > 0) { written = this->view_fasta_chunk(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh); - + if(this->flags.is_fourbit()) { written = remove_chars(chunk, '-', written); } - + MD5_Update(&ctx, chunk, written); chunk[remaining_bytes] = '\0'; } diff --git a/src/fuse.cpp b/src/fuse.cpp index 2208782d..ea71b29c 100644 --- a/src/fuse.cpp +++ b/src/fuse.cpp @@ -42,6 +42,7 @@ struct fuse_instance { // generic uint32_t padding; + bool allow_masking; int argc_fuse; }; @@ -365,6 +366,7 @@ void print_fuse_help() std::cout << "\n"; std::cout << "general options:\n"; std::cout << " -2 --2bit virtualise a 2bit file rather than FASTAFS UID\n"; + std::cout << " -m --no-masking Disable masking; bases in lower-case (not for 2bit output)\n"; std::cout << " -p ,--padding padding / FASTA line length\n"; std::cout << " -o opt,[opt...] mount options\n"; std::cout << " -h --help print help\n"; @@ -439,7 +441,18 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) //fastafs_fuse_instance *ffi = new fastafs_fuse_instance({nullptr, 60, 1, new char[argc]}); //fastafs_fuse_instance *ffi = new fastafs_fuse_instance({nullptr, 60, 0, nullptr}); - fuse_instance *fi = new fuse_instance({nullptr, nullptr, nullptr, true, nullptr, 60, 0}); + + + fuse_instance *fi = new fuse_instance({ + nullptr, // pointer to fastafs decoder - if from_fasta is set to true + nullptr,// pointer to fastafs_init with defined padding + nullptr, // pointer to fastafs_init with cache size of 0 (for mounting ./seq/chr1:123-456 + true, // from fastafs + nullptr, // pointer to ucsc2bit decoder - if from_fasta is set to false + 60, // default_padding + true, // allow_masking + 0 // argc_fuse + }); //fuse option variable to send to fuse argv_fuse[fi->argc_fuse++] = (char *) "fastafs"; // becomes fuse.fastafs @@ -479,6 +492,9 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) case '2': // a fastafs specific flag fi->from_fastafs = false; break; + case 'm': // disable masking + fi->allow_masking = false; + break; case 'f': // fuse specific flags case 's': @@ -492,7 +508,13 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) break; default: // argument, fastafs spcific (such as '-p' followed by '50') - current_argument = argv[i][1]; + if(strcmp(argv[i], "--2bit") == 0) { + fi->from_fastafs = false;; + } else if(strcmp(argv[i], "--no-masking") == 0) { + fi->allow_masking = false; + } else { + current_argument = argv[i][1]; + } break; } } else { @@ -529,7 +551,7 @@ fuse_instance *parse_args(int argc, char **argv, char **argv_fuse) fi->f = new fastafs(name); fi->f->load(fname); - fi->cache = fi->f->init_ffs2f(fi->padding, true);// allow mixed case + fi->cache = fi->f->init_ffs2f(fi->padding, fi->allow_masking); fi->cache_p0 = fi->f->init_ffs2f(0, true);// allow mixed case } else { std::string basename = basename_cpp(std::string(argv[mount_target_arg])); diff --git a/src/main.cpp b/src/main.cpp index af179ce0..5033a5f8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -118,8 +118,7 @@ int main(int argc, char *argv[]) if(is_fasta_file(argv[argc - 1])) { try { fasta_to_twobit_fastafs(argv[argc - 1], fname_out); - } - catch(std::runtime_error& e) { + } catch(std::runtime_error& e) { fasta_to_fourbit_fastafs(argv[argc - 1], fname_out); } } else if(is_ucsc2bit_file(argv[argc - 1])) { @@ -145,6 +144,7 @@ int main(int argc, char *argv[]) bool from_file = false; bool skip_argument = false; + bool allow_masking = true;// allow upper and lower case for(int i = 2; i < argc - 1; i++) { if(skip_argument) { @@ -152,6 +152,8 @@ int main(int argc, char *argv[]) } else { if(strcmp(argv[i], "-f") == 0 or strcmp(argv[i], "--file") == 0) { from_file = true; + } else if(strcmp(argv[i], "-m") == 0 or strcmp(argv[i], "--no-masking") == 0) { + allow_masking = false; } else if((strcmp(argv[i], "-p") == 0 or strcmp(argv[i], "--padding") == 0) and i + 1 < argc - 1) { try { sscanf(argv[++i], "%u", &padding); @@ -194,7 +196,7 @@ int main(int argc, char *argv[]) written = f.view_ucsc2bit_chunk(buffer, READ_BUFFER_SIZE, offset); } } else { - ffs2f_init* cache = f.init_ffs2f(padding, true); + ffs2f_init* cache = f.init_ffs2f(padding, allow_masking); f.view_fasta(cache);//@todo make argument parsing delete cache; diff --git a/src/utils.cpp b/src/utils.cpp index cae392cc..145371b0 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -63,18 +63,20 @@ void uint_to_twobytes(char *chars, uint16_t n) -size_t remove_chars(char *s, int c, size_t l){ +size_t remove_chars(char *s, int c, size_t l) +{ size_t j = 0; - size_t n = l; - - for (size_t i=j=0; i < n; i++) - if (s[i] != c) - s[j++] = s[i]; - - s[j] = '\0'; - + size_t n = l; + + for(size_t i = j = 0; i < n; i++) + if(s[i] != c) { + s[j++] = s[i]; + } + + s[j] = '\0'; + return j; -} +} void uint_to_fourbytes(char *chars, uint32_t n) From ac1f1a8206de189739598e16a67f4e4733d75f2c Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 21:05:28 +0100 Subject: [PATCH 116/119] small fixes --- build-release.sh | 5 +++-- src/sequence_region.cpp | 2 +- src/utils.cpp | 2 +- test/sequenceregion/test_sequenceregion.cpp | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/build-release.sh b/build-release.sh index aa015011..8f32b5c1 100755 --- a/build-release.sh +++ b/build-release.sh @@ -1,5 +1,6 @@ #!/bin/bash -cmake -GNinja -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +#cmake -GNinja -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . +cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . make "$@" -j `nproc` -sudo make install +make install diff --git a/src/sequence_region.cpp b/src/sequence_region.cpp index 6ec5b34c..4f7a2015 100644 --- a/src/sequence_region.cpp +++ b/src/sequence_region.cpp @@ -35,7 +35,7 @@ void sequence_region::parse(const char * seqstr) if(p > 0) { this->seq_name = std::string(seqstr, 0, p); - } else if(p == false) { + } else if(proceed == true) { // either with string > 255 chars or string smaller than 255 without ':' this->seq_name = std::string(seqstr, 0, string_max_pos); } diff --git a/src/utils.cpp b/src/utils.cpp index 145371b0..6c565207 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -191,7 +191,7 @@ bool is_fasta_file(char *filename) bool is_ucsc2bit_file(char *filename) { - char buf[2]; + char buf[4+1]; FILE *fp; if((fp = fopen(filename, "rb")) == NULL) { diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index 8d2857db..6498a33a 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -246,9 +246,9 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) char arg[] = "/seq/chrRr1:456-123"; sequence_region *sr = nullptr; - if(sr == nullptr) {// compiler doesn't understand this otherwise + //if(sr == nullptr) {// compiler doesn't understand this otherwise BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); - } + //} } } From d2a86f00e8fa81adefa5521de923100fa03019b7 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 21:13:42 +0100 Subject: [PATCH 117/119] sav --- build-debug.sh | 2 +- build-release.sh | 3 ++- include/fasta_to_twobit_fastafs.hpp | 2 +- src/fasta_to_twobit_fastafs.cpp | 2 +- test/cache/test_cache_fourbit.cpp | 2 ++ test/sequenceregion/test_sequenceregion.cpp | 3 ++- 6 files changed, 9 insertions(+), 5 deletions(-) diff --git a/build-debug.sh b/build-debug.sh index 9f4ab973..eec58e92 100755 --- a/build-debug.sh +++ b/build-debug.sh @@ -7,5 +7,5 @@ ## using make - sometimes much slower cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_INSTALL_PREFIX=~/.local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -make "$@" -j `nproc` +make "$@" -j $(nproc) make install diff --git a/build-release.sh b/build-release.sh index 8f32b5c1..5f240a76 100755 --- a/build-release.sh +++ b/build-release.sh @@ -1,6 +1,7 @@ #!/bin/bash #cmake -GNinja -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . + cmake -DCMAKE_BUILD_TYPE=release -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON . -make "$@" -j `nproc` +make "$@" -j $(nproc) make install diff --git a/include/fasta_to_twobit_fastafs.hpp b/include/fasta_to_twobit_fastafs.hpp index 5bb7306d..4716397b 100644 --- a/include/fasta_to_twobit_fastafs.hpp +++ b/include/fasta_to_twobit_fastafs.hpp @@ -57,5 +57,5 @@ class fasta_seq_header_twobit_conversion_data }; -size_t fasta_to_twobit_fastafs(const std::string, const std::string); +size_t fasta_to_twobit_fastafs(const std::string, const std::string &); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 27b10a27..8e98293e 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -117,7 +117,7 @@ void fasta_seq_header_twobit_conversion_data::finish_sequence(std::ofstream &fh_ -size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string fastafs_file) +size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string &fastafs_file) { std::vector index; fasta_seq_header_twobit_conversion_data* s; diff --git a/test/cache/test_cache_fourbit.cpp b/test/cache/test_cache_fourbit.cpp index 42c0c4b1..d71e396e 100644 --- a/test/cache/test_cache_fourbit.cpp +++ b/test/cache/test_cache_fourbit.cpp @@ -202,10 +202,12 @@ BOOST_AUTO_TEST_CASE(test_cache) for(unsigned int i = 0; i < size; i++) { BOOST_CHECK_EQUAL(buffer[i], reference[i]); + /* if(reference[i] != buffer[i]) { printf("comparing char %u\n", i); printf(" ** mismatch [%d] [ref] %d != [buf] %d (%c x %02hhX)\n", i, reference[i], buffer[i], buffer[i], buffer[i]); } + */ } diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index 6498a33a..4114a8f6 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -166,10 +166,11 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) } { - char arg[] = "/seq/chrRr1:1235-1234"; sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise + char arg[] = "/seq/chrRr1:1235-1234"; + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); } } From b34851cfe8495af4753a8e23e3139cdb3548ec23 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 21:27:35 +0100 Subject: [PATCH 118/119] small fixes --- include/fasta_to_twobit_fastafs.hpp | 4 ++-- include/ucsc2bit_to_fastafs.hpp | 3 +++ src/fasta_to_fourbit_fastafs.cpp | 8 ++++---- src/fasta_to_twobit_fastafs.cpp | 2 +- src/fastafs.cpp | 2 +- src/utils.cpp | 2 +- test/sequenceregion/test_sequenceregion.cpp | 4 ++-- 7 files changed, 14 insertions(+), 11 deletions(-) diff --git a/include/fasta_to_twobit_fastafs.hpp b/include/fasta_to_twobit_fastafs.hpp index 4716397b..9199548a 100644 --- a/include/fasta_to_twobit_fastafs.hpp +++ b/include/fasta_to_twobit_fastafs.hpp @@ -27,7 +27,7 @@ class fasta_seq_header_twobit_conversion_data bool previous_was_N; - fasta_seq_header_twobit_conversion_data(off_t fof, std::string name): + fasta_seq_header_twobit_conversion_data(off_t fof, const std::string &name): file_offset_in_fasta(fof), name(name), N(0), @@ -57,5 +57,5 @@ class fasta_seq_header_twobit_conversion_data }; -size_t fasta_to_twobit_fastafs(const std::string, const std::string &); +size_t fasta_to_twobit_fastafs(const std::string &, const std::string &); diff --git a/include/ucsc2bit_to_fastafs.hpp b/include/ucsc2bit_to_fastafs.hpp index 0c078b15..f5f3ec1d 100644 --- a/include/ucsc2bit_to_fastafs.hpp +++ b/include/ucsc2bit_to_fastafs.hpp @@ -27,6 +27,9 @@ struct ucsc2bit_seq_header { uint32_t m_blocks; std::vector m_block_starts; std::vector m_block_sizes; + + ucsc2bit_seq_header(): + name_size(0), name(nullptr) { } }; struct ucsc2bit_seq_header_conversion_data { diff --git a/src/fasta_to_fourbit_fastafs.cpp b/src/fasta_to_fourbit_fastafs.cpp index dc33ab89..1f9ff5c5 100644 --- a/src/fasta_to_fourbit_fastafs.cpp +++ b/src/fasta_to_fourbit_fastafs.cpp @@ -571,7 +571,7 @@ size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string fh_fastafs.seekp(0, std::ios::end); - printf("file size now: %i\n", (uint32_t) fh_fastafs.tellp()); + //printf("file size now: %i\n", (uint32_t) fh_fastafs.tellp()); @@ -600,15 +600,15 @@ size_t fasta_to_fourbit_fastafs(const std::string &fasta_file, const std::string } // -- - printf("nnn = %i\n", nnn); + //printf("nnn = %i\n", nnn); //write crc as 4 bytes char byte_enc[5] = "\x00\x00\x00\x00"; uint_to_fourbytes(byte_enc, (uint32_t) crc); - printf("[%i][%i][%i][%i] ~ %02hhx%02hhx%02hhx%02hhx \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3], - byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); + //printf("[%i][%i][%i][%i] ~ %02hhx%02hhx%02hhx%02hhx \n", byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3], + // byte_enc[0], byte_enc[1], byte_enc[2], byte_enc[3]); fh_fastafs.write(reinterpret_cast(&byte_enc), (size_t) 4); diff --git a/src/fasta_to_twobit_fastafs.cpp b/src/fasta_to_twobit_fastafs.cpp index 8e98293e..7e673171 100644 --- a/src/fasta_to_twobit_fastafs.cpp +++ b/src/fasta_to_twobit_fastafs.cpp @@ -117,7 +117,7 @@ void fasta_seq_header_twobit_conversion_data::finish_sequence(std::ofstream &fh_ -size_t fasta_to_twobit_fastafs(const std::string fasta_file, const std::string &fastafs_file) +size_t fasta_to_twobit_fastafs(const std::string &fasta_file, const std::string &fastafs_file) { std::vector index; fasta_seq_header_twobit_conversion_data* s; diff --git a/src/fastafs.cpp b/src/fastafs.cpp index 1f5549d2..3b7de434 100644 --- a/src/fastafs.cpp +++ b/src/fastafs.cpp @@ -55,7 +55,7 @@ uint32_t fastafs_seq::fasta_filesize(uint32_t padding) { #if DEBUG if(padding == 0) { - throw std::invalid_argument("Padding is set to 0, should have been set to this->n elsewhere.\n"); + throw std::invalid_argument("[fastafs_seq::fasta_filesize] Padding is set to 0, should have been set to this->n elsewhere.\n"); } #endif // > chr \n ACTG NNN /number of newlines corresponding to ACTG NNN lines diff --git a/src/utils.cpp b/src/utils.cpp index 6c565207..f961a882 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -191,7 +191,7 @@ bool is_fasta_file(char *filename) bool is_ucsc2bit_file(char *filename) { - char buf[4+1]; + char buf[4 + 1]; FILE *fp; if((fp = fopen(filename, "rb")) == NULL) { diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index 4114a8f6..a5f02f2b 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -170,7 +170,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) sequence_region *sr = nullptr; if(sr == nullptr) {// compiler doesn't understand this otherwise char arg[] = "/seq/chrRr1:1235-1234"; - + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); } } @@ -248,7 +248,7 @@ BOOST_AUTO_TEST_CASE(test_sequence_region2) sequence_region *sr = nullptr; //if(sr == nullptr) {// compiler doesn't understand this otherwise - BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); + BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); //} } From 3607e5c6e265d3c5440865f3cde9bf5fd7412607 Mon Sep 17 00:00:00 2001 From: yhoogstrate Date: Wed, 29 Jan 2020 21:56:20 +0100 Subject: [PATCH 119/119] release notes v1.7.0 --- CMakeLists.txt | 56 ++++++++++----------- Changelog | 17 +++++-- include/ucsc2bit_to_fastafs.hpp | 2 +- test/sequenceregion/test_sequenceregion.cpp | 4 +- 4 files changed, 45 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d48b09c5..2c939c8c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,32 +142,32 @@ add_executable(mount.fastafs ) set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}") -#add_library(libfastafs SHARED - #src/fasta_to_twobit_fastafs.cpp - #src/ucsc2bit_to_fastafs.cpp - #src/flags.cpp - #src/fastafs.cpp - #src/ucsc2bit.cpp - #src/twobit_byte.cpp - #src/fourbit_byte.cpp - #src/database.cpp - #src/utils.cpp - #src/fuse.cpp - #src/lsfastafs.cpp -#) -#target_include_directories(libfastafs PUBLIC include) -#target_sources(libfastafs PUBLIC include/fastafs.hpp) - -#set_target_properties(libfastafs PROPERTIES LIBRARY_OUTPUT_DIRECTORY "lib") -#set_target_properties(libfastafs PROPERTIES VERSION ${PROJECT_VERSION}) -#set_target_properties(libfastafs PROPERTIES SOVERSION 1) -#set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs) - -#set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include") -# great, this doesn't go automagically with an entire dir -#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/fuse.hpp;include/meson.build;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") -#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include) -#set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include") +add_library(libfastafs SHARED + src/fasta_to_twobit_fastafs.cpp + src/ucsc2bit_to_fastafs.cpp + src/flags.cpp + src/fastafs.cpp + src/ucsc2bit.cpp + src/twobit_byte.cpp + src/fourbit_byte.cpp + src/database.cpp + src/utils.cpp + src/fuse.cpp + src/lsfastafs.cpp +) +target_include_directories(libfastafs PUBLIC include) +target_sources(libfastafs PUBLIC include/fastafs.hpp) + +set_target_properties(libfastafs PROPERTIES LIBRARY_OUTPUT_DIRECTORY "lib") +set_target_properties(libfastafs PROPERTIES VERSION ${PROJECT_VERSION}) +set_target_properties(libfastafs PROPERTIES SOVERSION 1) +set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs) + +##set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include") +## great, this doesn't go automagically with an entire dir +set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fourbit_fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/flags.hpp;include/fourbit_byte.hpp;include/fuse.hpp;include/lsfastafs.hpp;include/sequence_region.hpp;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp") +##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include) +##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include") # ---------------------------------------------------------------------- # ------------------------------ Testing ------------------------------- @@ -182,7 +182,7 @@ add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-) add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit") add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_fourbit") -#add_test(test_view "${BUILD_TEST_DIR}/test_view") +add_test(test_view "${BUILD_TEST_DIR}/test_view") add_test(test_flags "${BUILD_TEST_DIR}/test_flags") add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs") add_test(test_check "${BUILD_TEST_DIR}/test_check") # file integrity checks @@ -204,6 +204,6 @@ install(TARGETS fastafs DESTINATION "bin") install(TARGETS mount.fastafs DESTINATION "bin") # don't build during debug at least -#install(TARGETS libfastafs LIBRARY DESTINATION "lib" PUBLIC_HEADER DESTINATION "include/libfastafs") +install(TARGETS libfastafs LIBRARY DESTINATION "lib" PUBLIC_HEADER DESTINATION "include/libfastafs") # ---------------------------------------------------------------------- diff --git a/Changelog b/Changelog index b284a90a..59257bf3 100644 --- a/Changelog +++ b/Changelog @@ -2,10 +2,21 @@ * v1.7.0 * `fastafs cache -o` for custom output files and bypassing the config - * Random access retrievement via file system: `/seq/chr1:100-200` - * CRC32 checksums for file integratity - * converting to meson because of insane build times using cmake+make and re-building files that have not changed + * Random access subsequence retrieval diretly via filesystem: `/seq/chr1:100-200` + * Implements CRC32 checksums for whole-file integritity + * Converting to meson because of insane build times using cmake+make and re-building files that have not changed * `fastafs view|mount -m/--no-masking` virtualises fasta files without masking (uppercase) + * Minor support for building with meson and ninja + * cmake template allows building for guix (+guix file provided) + * Changed requirement from c++17 on c++14 to avoid large compatibility issues + * Implements bitflags with corresponding class + * Implements fourbit (and automatically switches over if non ACTGUN chars are found + * Implements functions `is_fasta_file`, and `is_ucsc2bit_file` using file MAGIC + * Creates by FASTAFS files that are first flagged as incomplete, that are unflagged after conversion has completed + * MD5sums working for fourbit compressed sequences + * Implements `fastafs cache -o` to export to desired output fastafs file + * Adds compression type to `fastafs list` output + * More and improved testing, including file integrity detection 2019-09-06 Youri Hoogstrate diff --git a/include/ucsc2bit_to_fastafs.hpp b/include/ucsc2bit_to_fastafs.hpp index f5f3ec1d..051abfb1 100644 --- a/include/ucsc2bit_to_fastafs.hpp +++ b/include/ucsc2bit_to_fastafs.hpp @@ -29,7 +29,7 @@ struct ucsc2bit_seq_header { std::vector m_block_sizes; ucsc2bit_seq_header(): - name_size(0), name(nullptr) { } + name_size(0), name(nullptr), n_blocks(0) { } }; struct ucsc2bit_seq_header_conversion_data { diff --git a/test/sequenceregion/test_sequenceregion.cpp b/test/sequenceregion/test_sequenceregion.cpp index a5f02f2b..5513761b 100644 --- a/test/sequenceregion/test_sequenceregion.cpp +++ b/test/sequenceregion/test_sequenceregion.cpp @@ -168,11 +168,11 @@ BOOST_AUTO_TEST_CASE(test_sequence_region3) { sequence_region *sr = nullptr; - if(sr == nullptr) {// compiler doesn't understand this otherwise + //if(sr == nullptr) {// compiler doesn't understand this otherwise char arg[] = "/seq/chrRr1:1235-1234"; BOOST_CHECK_THROW(sr = new sequence_region(&(arg[5])), std::invalid_argument); - } + //} } }