Skip to content

Commit

Permalink
make batches by number of bases
Browse files (browse the repository at this point in the history)
  • Loading branch information
gf777 committed Nov 25, 2024
1 parent 421a324 commit 117c8bd
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 13 deletions.
2 changes: 1 addition & 1 deletion gfalibs
Submodule gfalibs updated 2 files
+2 −4 Makefile
+2 −2 include/struct.h
2 changes: 1 addition & 1 deletion include/reads.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ friend class InReads;

class InReads {

uint32_t batchSize = 10000;
uint32_t batchSize = 1000000; // number of bases processed by a thread

std::vector<Log> logs;

Expand Down
24 changes: 13 additions & 11 deletions src/reads.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ void InReads::load() {

std::string newLine, seqHeader, seqComment, line, bedHeader;
std::size_t numFiles = userInput.inFiles.size();
uint32_t batchN = 0;
uint64_t processedLength = 0;
lg.verbose("Processing " + std::to_string(numFiles) + " files");

const static phmap::flat_hash_map<std::string,int> string_to_case{
Expand Down Expand Up @@ -117,17 +119,17 @@ void InReads::load() {
getline(*stream, *inSequence, '>');
readBatch->sequences.push_back(new Sequence {seqHeader, seqComment, inSequence});
seqPos++;
processedLength += inSequence->size();

if (seqPos % batchSize == 0) {

readBatch->batchN = seqPos/batchSize;
if (processedLength > batchSize) {
readBatch->batchN = ++batchN;
lg.verbose("Processing batch N: " + std::to_string(readBatch->batchN));
appendReads(readBatch);
readBatch = new Sequences;
processedLength = 0;
}
lg.verbose("Individual fasta sequence read: " + seqHeader);
//lg.verbose("Individual fasta sequence read: " + seqHeader);
}

break;
}
case '@': {
Expand All @@ -154,21 +156,21 @@ void InReads::load() {

readBatch->sequences.push_back(new Sequence {seqHeader, seqComment, inSequence, inSequenceQuality});
seqPos++;
processedLength += inSequence->size();

if (seqPos % batchSize == 0) {
readBatch->batchN = seqPos/batchSize;
if (processedLength > batchSize) {
readBatch->batchN = ++batchN;
lg.verbose("Processing batch N: " + std::to_string(readBatch->batchN));
appendReads(readBatch);
readBatch = new Sequences;

processedLength = 0;
}
lg.verbose("Individual fastq sequence read: " + seqHeader);
//lg.verbose("Individual fastq sequence read: " + seqHeader);
}
break;
}
}

readBatch->batchN = seqPos/batchSize + 1;
readBatch->batchN = ++batchN; // process residual reads
lg.verbose("Processing batch N: " + std::to_string(readBatch->batchN));
appendReads(readBatch);
}
Expand Down

0 comments on commit 117c8bd

Please sign in to comment.