Skip to content

Commit

Permalink
Merge pull request #33 from yhoogstrate/sha1_via_cached_reading
Browse files Browse the repository at this point in the history
Sha1 via cached reading
  • Loading branch information
yhoogstrate authored Apr 30, 2019
2 parents 58e3c71 + e86adbe commit c928d17
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 13 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ project(fastafs)
# Do this once in a while - find different bugs
#set(CMAKE_CXX_COMPILER "clang++")

set(PROJECT_VERSION "1.1.0")
set(PROJECT_VERSION "1.1.1")
set(PACKAGE_URL "https://github.com/yhoogstrate/fastafs")
set(PACKAGE_BUGREPORT "${PACKAGE_URL}/issues")

Expand Down
8 changes: 6 additions & 2 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
2019-11-18 Youri Hoogstrate
2019-04-30 Youri Hoogstrate

* Performance update.

2019-04-18 Youri Hoogstrate

* The forwards conversion (fasta to fastafs) code has been improved
and does not need to keep whole sequences on the heap.

2019-11-04 Youri Hoogstrate
2019-04-11 Youri Hoogstrate

* The file format has been re-specified, backwards incompatible with
the test code. The fastafs to fasta projection has been enormously
Expand Down
5 changes: 3 additions & 2 deletions include/fastafs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,12 @@ class fastafs_seq

uint32_t fasta_filesize(uint32_t padding);
void view_fasta(ffs2f_init_seq*, std::ifstream *);
//uint32_t view_fasta_chunk_cached( char *, off_t, size_t, std::ifstream *);//@todo order of off_t and size_t needs to be identical to view chunk in fastafs::

// legacy: slow code
uint32_t view_fasta_chunk(uint32_t, char *, off_t, size_t, std::ifstream *);//@todo order of off_t and size_t needs to be identical to view chunk in fastafs::
uint32_t view_fasta_chunk_cached(ffs2f_init_seq*, char *, size_t, off_t, std::ifstream *);

std::string sha1(std::ifstream *);
std::string sha1(ffs2f_init_seq*, std::ifstream *);

uint32_t n_twobits();

Expand Down
18 changes: 12 additions & 6 deletions src/fastafs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ fastafs check short 1.53s user 2.73s system 99% cpu 4.269 total
chunk size 1024:
??
*/
std::string fastafs_seq::sha1(std::ifstream *fh)
std::string fastafs_seq::sha1(ffs2f_init_seq* cache, std::ifstream *fh)
{
const size_t header_offset = this->name.size() + 2;
//const size_t fasta_size = header_offset + this->n; // not size effectively, as terminating newline is skipped..., but length to be read
Expand All @@ -483,15 +483,17 @@ std::string fastafs_seq::sha1(std::ifstream *fh)
unsigned long nbases = 0;

for(unsigned long i = 0; i < n_iterations; i++) {
this->view_fasta_chunk(0, chunk, header_offset + (i * chunksize), chunksize, fh);
//this->view_fasta_chunk(0, chunk, header_offset + (i * chunksize), chunksize, fh);
this->view_fasta_chunk_cached(cache, chunk, chunksize, header_offset + (n_iterations * chunksize), fh);
//printf("[%s] - %i\n", chunk, chunksize);
SHA1_Update(&ctx, chunk, chunksize);

nbases += chunksize;
}

if(remaining_bytes > 0) {
this->view_fasta_chunk(0, chunk, header_offset + (n_iterations * chunksize), remaining_bytes, fh);
//this->view_fasta_chunk(0, chunk, header_offset + (n_iterations * chunksize), remaining_bytes, fh);
this->view_fasta_chunk_cached(cache, chunk, remaining_bytes, header_offset + (n_iterations * chunksize), fh);
SHA1_Update(&ctx, chunk, remaining_bytes);
nbases += remaining_bytes;

Expand Down Expand Up @@ -881,6 +883,7 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi
std::ifstream file (this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
if (file.is_open()) {
char n_seq[4];
ffs2f_init* cache = this->init_ffs2f(0);

pos_limit += 4;// skip this loop after writing first four bytes
while(pos < pos_limit) {
Expand Down Expand Up @@ -1048,7 +1051,8 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi
pos_limit += full_twobits;
while(pos < pos_limit) {
//printf("%i - %i = %i || %i\n",pos_limit,pos, (full_twobits - (pos_limit - pos)) * 4, j);
sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file);
//sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), 4, &file);
sequence->view_fasta_chunk_cached(cache->sequences[i], n_seq, 4, sequence->name.size() + 2 + ((full_twobits - (pos_limit - pos)) * 4), &file);
t.set(n_seq);
buffer[written++] = t.data;
pos++;
Expand All @@ -1069,7 +1073,8 @@ uint32_t fastafs::view_ucsc2bit_chunk(char *buffer, size_t buffer_size, off_t fi
if(pos < pos_limit) {
//printf("%i - %i = %i || %i :: %i == %i \n",pos_limit,pos, full_twobits * 4, j, sequence->n - (full_twobits * 4), sequence->n - j);

sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + full_twobits * 4, sequence->n - (full_twobits * 4), &file);
//sequence->view_fasta_chunk(0, n_seq, sequence->name.size() + 2 + full_twobits * 4, sequence->n - (full_twobits * 4), &file);
sequence->view_fasta_chunk_cached(cache->sequences[i], n_seq, sequence->n - (full_twobits * 4), sequence->name.size() + 2 + full_twobits * 4, &file);
t.set(n_seq);

buffer[written++] = t.data;
Expand Down Expand Up @@ -1312,13 +1317,14 @@ int fastafs::check_integrity()
char sha1_hash[41] = "";
sha1_hash[40] = '\0';
std::string old_hash;
ffs2f_init* cache = this->init_ffs2f(0);

std::ifstream file (this->filename.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
if (file.is_open()) {
for(uint32_t i = 0; i < this->data.size(); i++) {
sha1_digest_to_hash(this->data[i]->sha1_digest, sha1_hash);
old_hash = std::string(sha1_hash);
std::string new_hash = this->data[i]->sha1(&file);
std::string new_hash = this->data[i]->sha1(cache->sequences[i], &file);

if(old_hash.compare(new_hash) == 0) {
printf("OK\t%s\n",this->data[i]->name.c_str());
Expand Down
5 changes: 3 additions & 2 deletions test/fastafs/test_fastafs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1)

fastafs fs = fastafs("test");
fs.load(fastafs_file);
ffs2f_init* cache = fs.init_ffs2f(0);

//printf("[%i]\n", fs.data.size());
BOOST_REQUIRE(fs.data.size() > 0);
Expand All @@ -316,9 +317,9 @@ BOOST_AUTO_TEST_CASE(test_fastafs_seq_sha1)
BOOST_REQUIRE(file.is_open());

//printf("[%s]\n", fs.data[0]->sha1(&file).c_str());
fs.data[0]->sha1(&file);
fs.data[0]->sha1(cache->sequences[0], &file);

BOOST_CHECK_EQUAL(fs.data[0]->sha1(&file), "2c0cae1d4e272b3ba63e7dd7e3c0efe62f2aaa2f");
BOOST_CHECK_EQUAL(fs.data[0]->sha1(cache->sequences[0], &file), "2c0cae1d4e272b3ba63e7dd7e3c0efe62f2aaa2f");
}


Expand Down

0 comments on commit c928d17

Please sign in to comment.