Skip to content

Commit

Permalink
Merge pull request #55 from yhoogstrate/polyN4bit
Browse files Browse the repository at this point in the history
Poly n4bit
  • Loading branch information
yhoogstrate authored Feb 25, 2020
2 parents b7b66e6 + 53e6b1d commit c2d8438
Show file tree
Hide file tree
Showing 36 changed files with 2,300 additions and 1,519 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,5 @@ xcheck.sh
.ninja*
test-mount.sh
*.dict
*.pyc
.venv
17 changes: 8 additions & 9 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ project(fastafs)
# Do this once in a while - find different bugs
#set(CMAKE_CXX_COMPILER "clang++")

set(PROJECT_VERSION "1.7.1")
set(PROJECT_VERSION "1.7.2")
set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs")
set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues")

Expand Down Expand Up @@ -101,15 +101,14 @@ link_libraries(z)# zlib; -lz; for crc32 checks on whole file integrity


if(DEBUG)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Og -Wall -g -ggdb -Wconversion -D_FILE_OFFSET_BITS=64")# -Werror makes compilation crash when warnings are given (also part of Travis)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -Wall -g -ggdb -Wconversion -D_FILE_OFFSET_BITS=64")# -Werror makes compilation crash when warnings are given (also part of Travis)
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -D_FILE_OFFSET_BITS=64")
endif()

add_executable(fastafs
src/main.cpp
src/fasta_to_twobit_fastafs.cpp
src/fasta_to_fourbit_fastafs.cpp
src/fasta_to_fastafs.cpp
src/ucsc2bit_to_fastafs.cpp
src/flags.cpp
src/fastafs.cpp
Expand All @@ -127,7 +126,7 @@ set_target_properties(fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}"
# mount-only binary, without all the other stuff 'mount.fastafs' [for fstab]
add_executable(mount.fastafs
src/main_mount.cpp
src/fasta_to_twobit_fastafs.cpp
src/fasta_to_fastafs.cpp
src/ucsc2bit_to_fastafs.cpp
src/flags.cpp
src/fastafs.cpp
Expand All @@ -143,7 +142,7 @@ add_executable(mount.fastafs
set_target_properties(mount.fastafs PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${BUILD_DIR}")

add_library(libfastafs SHARED
src/fasta_to_twobit_fastafs.cpp
src/fasta_to_fastafs.cpp
src/ucsc2bit_to_fastafs.cpp
src/flags.cpp
src/fastafs.cpp
Expand All @@ -165,7 +164,7 @@ set_target_properties(libfastafs PROPERTIES OUTPUT_NAME fastafs)

##set_target_properties(libfastafs PROPERTIES HEADER_OUTPUT_DIRECTORY "include")
## great, this doesn't go automagically with an entire dir
set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fourbit_fastafs.hpp;include/fasta_to_twobit_fastafs.hpp;include/flags.hpp;include/fourbit_byte.hpp;include/fuse.hpp;include/lsfastafs.hpp;include/sequence_region.hpp;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp")
set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER "include/config.hpp;include/database.hpp;include/fastafs.hpp;include/fasta_to_fastafs.hpp;include/flags.hpp;include/fourbit_byte.hpp;include/fuse.hpp;include/lsfastafs.hpp;include/sequence_region.hpp;include/twobit_byte.hpp;include/ucsc2bit.hpp;include/ucsc2bit_to_fastafs.hpp;include/utils.hpp")
##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_DIRECTORY include)
##set_target_properties(libfastafs PROPERTIES PUBLIC_HEADER_OUTPUT_DIRECTORY "include")

Expand All @@ -180,8 +179,7 @@ add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) # 'make check' as al

add_test(test_twobit_byte "${BUILD_TEST_DIR}/test_twobit_byte") # ACTG(N) | ACUG(N)
add_test(test_fourbit_byte "${BUILD_TEST_DIR}/test_fourbit_byte") # ACGTURYKMSWBDHVN(-)
add_test(test_cache_twobit "${BUILD_TEST_DIR}/test_cache_twobit")
add_test(test_cache_fourbit "${BUILD_TEST_DIR}/test_cache_fourbit")
add_test(test_cache "${BUILD_TEST_DIR}/test_cache")
add_test(test_view "${BUILD_TEST_DIR}/test_view")
add_test(test_flags "${BUILD_TEST_DIR}/test_flags")
add_test(test_fastafs "${BUILD_TEST_DIR}/test_fastafs")
Expand All @@ -191,6 +189,7 @@ add_test(test_ucsc2bit_to_fastafs "${BUILD_TEST_DIR}/test_ucsc2bit_to_fastafs")
add_test(test_ucsc2bit_as_fasta "${BUILD_TEST_DIR}/test_ucsc2bit_as_fasta")
add_test(test_sequenceregion "${BUILD_TEST_DIR}/test_sequenceregion")
add_test(test_utils "${BUILD_TEST_DIR}/test_utils")
add_test(test_functional "test/test_functional.py")
#add_test(test_tree "${BUILD_TEST_DIR}/test_tree")

#find_program(CTEST_MEMORYCHECK_COMMAND NAMES valgrind) # 'ctest -T memcheck'
Expand Down
8 changes: 8 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
2020-02-04 Youri Hoogstrate

* v1.7.2
* Reduce file read syscalls and gain performance
* Functional testing in Python, for both viewing to stdout and mounting
* Added templating for cleaner code
* Auto conversion to fourbit if that is more efficient compression

2020-02-01 Youri Hoogstrate

* v1.7.1
Expand Down
8 changes: 8 additions & 0 deletions include/config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,12 @@ static const std::string FASTAFS_PID_XATTR_NAME = "fastafs-pid";
static const size_t MAX_SIZE_SEQ_NAME = 255;


// Identifiers for the nucleotide dictionary (encoding) in use: 2-bit or 4-bit.
// NOTE(review): presumably the values held by fasta_to_fastafs_seq::current_dict,
// which starts out as DICT_TWOBIT - confirm against the converter sources.
const static char DICT_TWOBIT = 1;
const static char DICT_FOURBIT = 2;


static const char ENCODE_HASH_TWOBIT_DNA[256][5] = {"TTTT", "TTTC", "TTTA", "TTTG", "TTCT", "TTCC", "TTCA", "TTCG", "TTAT", "TTAC", "TTAA", "TTAG", "TTGT", "TTGC", "TTGA", "TTGG", "TCTT", "TCTC", "TCTA", "TCTG", "TCCT", "TCCC", "TCCA", "TCCG", "TCAT", "TCAC", "TCAA", "TCAG", "TCGT", "TCGC", "TCGA", "TCGG", "TATT", "TATC", "TATA", "TATG", "TACT", "TACC", "TACA", "TACG", "TAAT", "TAAC", "TAAA", "TAAG", "TAGT", "TAGC", "TAGA", "TAGG", "TGTT", "TGTC", "TGTA", "TGTG", "TGCT", "TGCC", "TGCA", "TGCG", "TGAT", "TGAC", "TGAA", "TGAG", "TGGT", "TGGC", "TGGA", "TGGG", "CTTT", "CTTC", "CTTA", "CTTG", "CTCT", "CTCC", "CTCA", "CTCG", "CTAT", "CTAC", "CTAA", "CTAG", "CTGT", "CTGC", "CTGA", "CTGG", "CCTT", "CCTC", "CCTA", "CCTG", "CCCT", "CCCC", "CCCA", "CCCG", "CCAT", "CCAC", "CCAA", "CCAG", "CCGT", "CCGC", "CCGA", "CCGG", "CATT", "CATC", "CATA", "CATG", "CACT", "CACC", "CACA", "CACG", "CAAT", "CAAC", "CAAA", "CAAG", "CAGT", "CAGC", "CAGA", "CAGG", "CGTT", "CGTC", "CGTA", "CGTG", "CGCT", "CGCC", "CGCA", "CGCG", "CGAT", "CGAC", "CGAA", "CGAG", "CGGT", "CGGC", "CGGA", "CGGG", "ATTT", "ATTC", "ATTA", "ATTG", "ATCT", "ATCC", "ATCA", "ATCG", "ATAT", "ATAC", "ATAA", "ATAG", "ATGT", "ATGC", "ATGA", "ATGG", "ACTT", "ACTC", "ACTA", "ACTG", "ACCT", "ACCC", "ACCA", "ACCG", "ACAT", "ACAC", "ACAA", "ACAG", "ACGT", "ACGC", "ACGA", "ACGG", "AATT", "AATC", "AATA", "AATG", "AACT", "AACC", "AACA", "AACG", "AAAT", "AAAC", "AAAA", "AAAG", "AAGT", "AAGC", "AAGA", "AAGG", "AGTT", "AGTC", "AGTA", "AGTG", "AGCT", "AGCC", "AGCA", "AGCG", "AGAT", "AGAC", "AGAA", "AGAG", "AGGT", "AGGC", "AGGA", "AGGG", "GTTT", "GTTC", "GTTA", "GTTG", "GTCT", "GTCC", "GTCA", "GTCG", "GTAT", "GTAC", "GTAA", "GTAG", "GTGT", "GTGC", "GTGA", "GTGG", "GCTT", "GCTC", "GCTA", "GCTG", "GCCT", "GCCC", "GCCA", "GCCG", "GCAT", "GCAC", "GCAA", "GCAG", "GCGT", "GCGC", "GCGA", "GCGG", "GATT", "GATC", "GATA", "GATG", "GACT", "GACC", "GACA", "GACG", "GAAT", "GAAC", "GAAA", "GAAG", "GAGT", "GAGC", "GAGA", "GAGG", "GGTT", "GGTC", "GGTA", 
"GGTG", "GGCT", "GGCC", "GGCA", "GGCG", "GGAT", "GGAC", "GGAA", "GGAG", "GGGT", "GGGC", "GGGA", "GGGG"};
static const char ENCODE_HASH_TWOBIT_RNA[256][5] = {"UUUU", "UUUC", "UUUA", "UUUG", "UUCU", "UUCC", "UUCA", "UUCG", "UUAU", "UUAC", "UUAA", "UUAG", "UUGU", "UUGC", "UUGA", "UUGG", "UCUU", "UCUC", "UCUA", "UCUG", "UCCU", "UCCC", "UCCA", "UCCG", "UCAU", "UCAC", "UCAA", "UCAG", "UCGU", "UCGC", "UCGA", "UCGG", "UAUU", "UAUC", "UAUA", "UAUG", "UACU", "UACC", "UACA", "UACG", "UAAU", "UAAC", "UAAA", "UAAG", "UAGU", "UAGC", "UAGA", "UAGG", "UGUU", "UGUC", "UGUA", "UGUG", "UGCU", "UGCC", "UGCA", "UGCG", "UGAU", "UGAC", "UGAA", "UGAG", "UGGU", "UGGC", "UGGA", "UGGG", "CUUU", "CUUC", "CUUA", "CUUG", "CUCU", "CUCC", "CUCA", "CUCG", "CUAU", "CUAC", "CUAA", "CUAG", "CUGU", "CUGC", "CUGA", "CUGG", "CCUU", "CCUC", "CCUA", "CCUG", "CCCU", "CCCC", "CCCA", "CCCG", "CCAU", "CCAC", "CCAA", "CCAG", "CCGU", "CCGC", "CCGA", "CCGG", "CAUU", "CAUC", "CAUA", "CAUG", "CACU", "CACC", "CACA", "CACG", "CAAU", "CAAC", "CAAA", "CAAG", "CAGU", "CAGC", "CAGA", "CAGG", "CGUU", "CGUC", "CGUA", "CGUG", "CGCU", "CGCC", "CGCA", "CGCG", "CGAU", "CGAC", "CGAA", "CGAG", "CGGU", "CGGC", "CGGA", "CGGG", "AUUU", "AUUC", "AUUA", "AUUG", "AUCU", "AUCC", "AUCA", "AUCG", "AUAU", "AUAC", "AUAA", "AUAG", "AUGU", "AUGC", "AUGA", "AUGG", "ACUU", "ACUC", "ACUA", "ACUG", "ACCU", "ACCC", "ACCA", "ACCG", "ACAU", "ACAC", "ACAA", "ACAG", "ACGU", "ACGC", "ACGA", "ACGG", "AAUU", "AAUC", "AAUA", "AAUG", "AACU", "AACC", "AACA", "AACG", "AAAU", "AAAC", "AAAA", "AAAG", "AAGU", "AAGC", "AAGA", "AAGG", "AGUU", "AGUC", "AGUA", "AGUG", "AGCU", "AGCC", "AGCA", "AGCG", "AGAU", "AGAC", "AGAA", "AGAG", "AGGU", "AGGC", "AGGA", "AGGG", "GUUU", "GUUC", "GUUA", "GUUG", "GUCU", "GUCC", "GUCA", "GUCG", "GUAU", "GUAC", "GUAA", "GUAG", "GUGU", "GUGC", "GUGA", "GUGG", "GCUU", "GCUC", "GCUA", "GCUG", "GCCU", "GCCC", "GCCA", "GCCG", "GCAU", "GCAC", "GCAA", "GCAG", "GCGU", "GCGC", "GCGA", "GCGG", "GAUU", "GAUC", "GAUA", "GAUG", "GACU", "GACC", "GACA", "GACG", "GAAU", "GAAC", "GAAA", "GAAG", "GAGU", "GAGC", "GAGA", "GAGG", "GGUU", "GGUC", "GGUA", 
"GGUG", "GGCU", "GGCC", "GGCA", "GGCG", "GGAU", "GGAC", "GGAA", "GGAG", "GGGU", "GGGC", "GGGA", "GGGG"};


#endif
92 changes: 92 additions & 0 deletions include/fasta_to_fastafs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@

#include <vector>

#include <openssl/md5.h>

#include "config.hpp"
#include "utils.hpp"

#include "fastafs.hpp"
#include "twobit_byte.hpp"
#include "fourbit_byte.hpp"



class fasta_to_fastafs_seq
{
public:

void add_N();

void add_twobit_ACTG(unsigned char, std::ofstream &);//Adds a T or a U
void finish_twobit_sequence(std::ofstream &);

void add_fourbit_ACTG(unsigned char, std::ofstream &);//Adds a T or a U
void finish_fourbit_sequence(std::ofstream &);

off_t file_offset_in_fasta; // file positions in FASTA file where sequence data blocks starts [ACTG]
off_t file_offset_in_fastafs; // file positions in FASTAFS file where sequence data blocks starts [2bit/4bit]
std::string name;

uint32_t N;// number of N (unknown) nucleotides (n - N = total 2bit compressed nucleotides)
uint32_t n_actg;// number of non-N nucleotides (any [ACTGU])

size_t N_bytes_used();// total number of bytes needed to store N's
size_t twobit_bytes_used();// total number of bytes needed to store xBits

bool previous_was_N;




// all below are undefined at initialization
uint32_t padding;

// the followin should be member of a conversion struct, because they're not related to the original 2bit format:
MD5_CTX ctx;
unsigned char md5_digest[MD5_DIGEST_LENGTH];

std::vector<uint32_t> n_block_starts;
std::vector<uint32_t> n_block_ends;

std::vector<uint32_t> m_block_starts;
std::vector<uint32_t> m_block_ends;
bool in_m_block;

char current_dict;


bool has_T;
bool has_U;


twobit_byte twobit_data;
fourbit_byte fourbit_data;


fasta_to_fastafs_seq(off_t fof_fasta, off_t fof_fastafs, const std::string &name):
file_offset_in_fasta(fof_fasta),
file_offset_in_fastafs(fof_fastafs),
name(name),
N(0),
n_actg(0),
previous_was_N(false),
in_m_block(false),
current_dict(DICT_TWOBIT),
has_T(false),
has_U(false),
twobit_data(ENCODE_HASH_TWOBIT_DNA) // not relevant for encoding, only for decoding
{
if(name.size() > 255) {
fprintf(stderr, "[fasta_to_fastafs::init] sequence name truncated to 255 charaters: %s\n", name.c_str());
this->name = this->name.substr (0,255);
}
MD5_Init(&this->ctx);
}

void flush();
};


// Entry point of the FASTA -> FASTAFS conversion (declaration only).
// NOTE(review): the two string arguments are presumably the input FASTA path and
// the output FASTAFS path, and the size_t return value presumably a count of
// what was written - confirm against the definition.
// auto_recompress_to_fourbit: NOTE(review): presumably enables the automatic
// switch to 4-bit encoding when that compresses better - confirm.
size_t fasta_to_fastafs(const std::string &, const std::string &, bool auto_recompress_to_fourbit);

62 changes: 0 additions & 62 deletions include/fasta_to_fourbit_fastafs.hpp

This file was deleted.

61 changes: 0 additions & 61 deletions include/fasta_to_twobit_fastafs.hpp

This file was deleted.

26 changes: 25 additions & 1 deletion include/twobit_byte.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@
#include <array>
#include "config.hpp"


class twobit_byte
{
public:
static const char encode_hash[256][5];
const char (&encode_hash)[256][5];
twobit_byte(const char (&encode_hash_arg)[256][5]): encode_hash(encode_hash_arg) {};

static const char n_fill_unmasked = 'N';
static const char n_fill_masked = 'n';

Expand All @@ -24,4 +27,25 @@ class twobit_byte
static unsigned char iterator_to_offset(unsigned int);
};



// twobit_byte bound to the DNA lookup table (ENCODE_HASH_TWOBIT_DNA).
class twobit_byte_dna : public twobit_byte
{
public:
    // Default-constructible: forwards the DNA table to the base class.
    twobit_byte_dna()
        : twobit_byte{ENCODE_HASH_TWOBIT_DNA}
    {
    }
};



// twobit_byte bound to the RNA lookup table (ENCODE_HASH_TWOBIT_RNA).
class twobit_byte_rna : public twobit_byte
{
public:
    // Default-constructible: forwards the RNA table to the base class.
    twobit_byte_rna()
        : twobit_byte{ENCODE_HASH_TWOBIT_RNA}
    {
    }
};






#endif
2 changes: 2 additions & 0 deletions include/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,5 @@ bool is_ucsc2bit_file(char *);
std::string basename_cpp(std::string);
std::string realpath_cpp(std::string);

// Returns a 32-bit CRC for a file (declaration only).
// NOTE(review): the arguments are presumably the file path, a start offset and a
// byte count delimiting the region that is checksummed - confirm against the
// definition in the utils source file.
uint32_t file_crc32(const std::string &, off_t, size_t );

Loading

0 comments on commit c2d8438

Please sign in to comment.