Skip to content

Commit

Permalink
Update tools
Browse files Browse the repository at this point in the history
  • Loading branch information
telatin committed Sep 22, 2024
1 parent ae54809 commit 10d85a0
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 50 deletions.
33 changes: 19 additions & 14 deletions src/bench_comp_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import csv
import sys
from typing import List, NamedTuple
from collections import defaultdict, Counter
from collections import defaultdict

class ToolData(NamedTuple):
command: str
Expand Down Expand Up @@ -74,14 +74,17 @@ def rank_tools(data: List[ToolData]) -> List[ToolData]:
# Sort by mean execution time
return sorted(averaged_data, key=lambda x: x.mean)

def count_fastest_tools(file_paths: List[str]) -> Counter:
fastest_tools = Counter()
def calculate_tool_scores(file_paths: List[str]) -> dict:
tool_scores = defaultdict(int)
for file_path in file_paths:
data = parse_csv(file_path)
if data:
fastest_tool = min(data, key=lambda x: x.mean).command
fastest_tools[fastest_tool] += 1
return fastest_tools
sorted_tools = sorted(data, key=lambda x: x.mean)
num_tools = len(sorted_tools)
for rank, tool in enumerate(sorted_tools, start=1):
tool_scores[tool.command] += num_tools - rank

return tool_scores

def print_ranked_tools(ranked_tools: List[ToolData]):
print("Ranked list of tools (from fastest to slowest):")
Expand All @@ -91,27 +94,29 @@ def print_ranked_tools(ranked_tools: List[ToolData]):
for i, tool in enumerate(ranked_tools, 1):
print(f"{i:<5}{tool.command:<20}{tool.mean:<12.6f}{tool.median:<12.6f}{tool.stddev:<12.6f}{tool.min:<12.6f}{tool.max:<12.6f}")

def print_fastest_tool_frequency(fastest_tools: Counter):
print("\nRanked list of tools by frequency of being fastest:")
def print_tool_scores(tool_scores: dict):
print("\nRanked list of tools by score (NUM-RANK points):")
print("-" * 50)
print(f"{'Rank':<5}{'Command':<20}{'Frequency':<10}")
print(f"{'Rank':<5}{'Command':<20}{'Score':<10}")
print("-" * 50)
for i, (tool, count) in enumerate(fastest_tools.most_common(), 1):
print(f"{i:<5}{tool:<20}{count:<10}")
sorted_scores = sorted(tool_scores.items(), key=lambda x: x[1], reverse=True)
for i, (tool, score) in enumerate(sorted_scores, 1):
print(f"{i:<5}{tool:<20}{score:<10}")

def main():
if len(sys.argv) < 2:
print("Usage: python script.py <csv_file1> <csv_file2> ...")
sys.exit(1)

file_paths = sys.argv[1:]
total_files = len(file_paths)
all_data = process_files(file_paths)
ranked_tools = rank_tools(all_data)

print(f"Processed {total_files} files with {len(all_data)} data points.")
print_ranked_tools(ranked_tools)

fastest_tools = count_fastest_tools(file_paths)
print_fastest_tool_frequency(fastest_tools)
tool_scores = calculate_tool_scores(file_paths)
print_tool_scores(tool_scores)

if __name__ == "__main__":
main()
40 changes: 16 additions & 24 deletions src/n50.c
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
uint64_t *chunk_lengths = NULL;
int chunk_count = 0;
int chunk_capacity = INITIAL_CAPACITY;

// Extra statistics
uint64_t current_length = 0;
uint64_t min = UINT64_MAX;
uint64_t max = 0;
Expand All @@ -206,6 +208,14 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,

chunk_lengths = malloc(chunk_capacity * sizeof(uint64_t));

// lookup table
// Create a lookup table for base counting
uint8_t base_lookup[256] = {0};
base_lookup['A'] = base_lookup['a'] = 0;
base_lookup['C'] = base_lookup['c'] = 1;
base_lookup['G'] = base_lookup['g'] = 2;
base_lookup['T'] = base_lookup['t'] = 3;

if (is_fastq) {

#if DEBUG
Expand All @@ -223,7 +233,8 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
if (bytes_read == 0) break;

for (size_t i = 0; i < bytes_read; i++) {
if (buffer[i] == '\n') {
char c = buffer[i];
if (c == '\n') {
newline_count++;
if (newline_count % 4 == 1) {
in_sequence = true;
Expand All @@ -234,9 +245,7 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(uint64_t));
}
chunk_lengths[chunk_count++] = current_length;
#if DEBUG
fprintf(stderr, "Debug: Recorded sequence length %lu\n", current_length);
#endif

if (extra) {
if (current_length > max) max = current_length;
if (current_length < min) min = current_length;
Expand All @@ -246,22 +255,12 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
} else if (in_sequence) {
current_length++;
if (extra) {
char c = toupper(buffer[i]);
switch (c) {
case 'A': bases[0]++; break;
case 'C': bases[1]++; break;
case 'G': bases[2]++; break;
case 'T': bases[3]++; break;
default: bases[4]++;
}
bases[base_lookup[(uint8_t)c]]++;
}
}
}
}
#if DEBUG
fprintf(stderr, "Debug: End of file reached. Total records processed: %d\n", chunk_count);
#endif
} else { // FASTA
} else { // FASTA
bool in_sequence = false;
while (1) {
char *result;
Expand Down Expand Up @@ -291,14 +290,7 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
current_length += len;
if (extra) {
for (uint64_t i = 0; i < len; i++) {
char c = toupper(buffer[i]);
switch (c) {
case 'A': bases[0]++; break;
case 'C': bases[1]++; break;
case 'G': bases[2]++; break;
case 'T': bases[3]++; break;
default: bases[4]++;
}
bases[base_lookup[(uint8_t)buffer[i]]]++;
}
}
}
Expand Down
55 changes: 55 additions & 0 deletions src/n50_binner.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zlib.h>

#define MAX_LINE 1000000
#define NUM_BINS 16

// Function to determine the bin for a given length
int get_bin(int length) {
int bins[] = {10, 100, 1000, 2500, 5000, 10000, 20000, 35000, 50000, 75000, 100000, 200000, 300000, 500000, 750000, 1000000};
for (int i = 0; i < NUM_BINS; i++) {
if (length <= bins[i]) {
return i;
}
}
return NUM_BINS - 1; // For reads longer than the last bin
}

int main(int argc, char *argv[]) {
if (argc != 2) {
fprintf(stderr, "Usage: %s <fastq_file>\n", argv[0]);
return 1;
}

gzFile fp = gzopen(argv[1], "r");
if (!fp) {
fprintf(stderr, "Error: Could not open file %s\n", argv[1]);
return 1;
}

char line[MAX_LINE];
int counters[NUM_BINS] = {0};
int line_count = 0;

while (gzgets(fp, line, sizeof(line))) {
line_count++;
if (line_count % 4 == 2) { // This is the sequence line
int length = strlen(line) - 1; // Subtract 1 to remove newline
int bin = get_bin(length);
counters[bin]++;
}
}

gzclose(fp);

// Print results
printf("Bin,Number of Reads\n");
int bins[] = {10, 100, 1000, 2500, 5000, 10000, 20000, 35000, 50000, 75000, 100000, 200000, 300000, 500000, 750000, 1000000};
for (int i = 0; i < NUM_BINS; i++) {
printf("%d,%d\n", bins[i], counters[i]);
}

return 0;
}
60 changes: 48 additions & 12 deletions src/n50_simreads.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

#define MAX_ARGS 100 // Maximum number of arguments
#define MAX_PATH 1024 // Maximum path length
// make const char bases[] = "ACGTactAC"; // 70% AT, 30% CG as global constant
const char bases[] = "ACGTactAC";
/*
n50 suite - simulate reads
Usage: n50_simreads [--fasta|--fastq] -o OUTDIR ARGS
Expand All @@ -21,7 +23,8 @@
*/
// Function to generate a random DNA sequence
void generate_sequence(char *seq, int length) {
const char bases[] = "ACGTactAC"; // 70% AT, 30% CG
// Will receive two parameters, a pointer to a char and an int

for (int i = 0; i < length; i++) {
seq[i] = bases[rand() % 9];
}
Expand Down Expand Up @@ -83,28 +86,46 @@ int calculate_n50(const int *lengths, int num_seqs, long long *total_length) {

int main(int argc, char *argv[]) {
if (argc < 5) {
fprintf(stderr, "Usage: %s [--fasta|--fastq] -o OUTDIR ARGS\n", argv[0]);
fprintf(stderr, "Usage: %s [--fasta|--fastq] -o OUTDIR [-p PREFIX] ARGS\n", argv[0]);
fprintf(stderr, "ARGS format: COUNT*SIZE\n");
return 1;
}
} else if (argc > MAX_ARGS) {
fprintf(stderr, "Too many arguments. Maximum is %d.\n", MAX_ARGS);
return 1;
}

int is_fastq = 0;
int verbose = 0;
const char *format = is_fastq ? "FASTQ" : "FASTA";
char *outdir = NULL;
char *prefix = NULL;

for (int i = 1; i < argc - 1; i++) {
for (int i = 1; i < argc; i++) {

if (strcmp(argv[i], "--fastq") == 0) {
is_fastq = 1;
} else if (strcmp(argv[i], "--fasta") == 0) {
is_fastq = 0;
} else if (strcmp(argv[i], "-o") == 0) {
outdir = argv[++i];
} else if (strcmp(argv[i], "--verbose") == 0) {
verbose = 1;
fprintf(stderr, "Verbose mode enabled.\n");
} else if (strcmp(argv[i], "-p") == 0) {
prefix = argv[++i];
} else if (!strchr(argv[i], '*')) {
fprintf(stderr, "Invalid argument: %s\n", argv[i]);
return 1;
}
}

if (!outdir) {
fprintf(stderr, "Output directory not specified. Use -o OUTDIR.\n");
return 1;
}
if (!prefix) {
prefix = "x";
}

// Create output directory if it doesn't exist
struct stat st = {0};
Expand All @@ -118,7 +139,11 @@ int main(int argc, char *argv[]) {
int *lengths = NULL;
int lengths_capacity = 0; // Capacity of lengths array

for (int i = 4; i < argc; i++) {
for (int i = 0; i < argc; i++) {
// if "*" is not found, continue
if (!strchr(argv[i], '*')) {
continue;
}
char *arg = argv[i];
char *count_str = strtok(arg, "*");
char *size_str = strtok(NULL, "*");
Expand All @@ -130,8 +155,10 @@ int main(int argc, char *argv[]) {

int count = atoi(count_str);
int size = parse_size(size_str);
// print to stderr the count and size
fprintf(stderr, "# Write seqs: %d, of size: %d\n", count, size);

if (verbose) {
fprintf(stderr, "To do: %d sequences of size %d\n", count, size);
}
// Reallocate lengths array if necessary
if (total_seqs + count > lengths_capacity) {
lengths_capacity = total_seqs + count;
Expand All @@ -144,17 +171,16 @@ int main(int argc, char *argv[]) {

for (int j = 0; j < count; j++) {
lengths[total_seqs++] = size;
fprintf(stderr, " + Seq %d: %d\n", total_seqs, size);
}
}

long long total_length;
int n50 = calculate_n50(lengths, total_seqs, &total_length);

// print N50, total seqs and total length to STDERR
fprintf(stderr, "Format:\t%s\nN50:\t%d\nTot seqs:\t%d\nTot len:\t%lld\n", format, n50, total_seqs, total_length);
fprintf(stderr, "\n------\nMode:\t%s\nPrefix:\t%s\nFormat:\t%s\nN50:\t%d\nTot seqs:\t%d\nTot len:\t%lld\n------\n", verbose ? "verbose" : "standard", prefix, format,n50, total_seqs, total_length);
char filename[MAX_PATH];
snprintf(filename, MAX_PATH, "%s/%d_%d_%lld.%s", outdir, n50, total_seqs, total_length, is_fastq ? "fastq" : "fasta");
snprintf(filename, MAX_PATH, "%s/%s%d_%d_%lld.%s", outdir, prefix, n50, total_seqs, total_length, is_fastq ? "fastq" : "fasta");

FILE *outfile = fopen(filename, "w");
if (!outfile) {
Expand All @@ -168,9 +194,19 @@ int main(int argc, char *argv[]) {
if (is_fastq) {
quality = malloc(n50 + 1);
}

int last_length = 0;
for (int i = 0; i < total_seqs; i++) {
// print to stderr the count and size
if (lengths[i] != last_length) {

last_length = lengths[i];
}


generate_sequence(sequence, lengths[i]);
if (verbose && i % 1000 == 0) {
fprintf(stderr, " Generating seq #%d (%d bp)\r", i, lengths[i]);
}

if (is_fastq) {
generate_quality(quality, lengths[i]);
Expand All @@ -184,7 +220,7 @@ int main(int argc, char *argv[]) {
free(sequence);
if (quality) free(quality);
free(lengths);

fprintf(stderr, "\n");
printf("Output written to: %s\n", filename);

return 0;
Expand Down
34 changes: 34 additions & 0 deletions src/remake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# Check if required arguments are provided
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <input_file> <FORMAT> <OUTDIR>"
exit 1
fi

INPUT_FILE=$1
FORMAT=$2
OUTDIR=$3

# Process the input file and generate the string
GENERATED_STRING=$(awk -F',' '
NR>1 {
if ($1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ && $2 > 0) {
printf "%d*%d ", $2, $1
}
}
' "$INPUT_FILE" | sed 's/ $//')

# Run the n50_simreads command with the generated string
n50_simreads --${FORMAT} -o ${OUTDIR} $GENERATED_STRING

# Print the command that was executed (for verification)
echo "Executed command: n50_simreads --${FORMAT} -o ${OUTDIR} $GENERATED_STRING"

# Calculate and print total number of reads
TOTAL_READS=$(awk -F',' 'NR>1 && $2 ~ /^[0-9]+$/ {sum += $2} END {print sum}' "$INPUT_FILE")
echo "Total number of reads: $TOTAL_READS"

# Find the maximum read length
MAX_LENGTH=$(awk -F',' 'NR>1 && $1 ~ /^[0-9]+$/ && $2 > 0 {max=$1} END {print max}' "$INPUT_FILE")
echo "Maximum read length: $MAX_LENGTH"

0 comments on commit 10d85a0

Please sign in to comment.