Update tools

quadram-institute-bioscience · Sep 22, 2024 · 10d85a0 · 10d85a0
1 parent ae54809
commit 10d85a0
Show file tree

Hide file tree

Showing 5 changed files with 172 additions and 50 deletions.
diff --git a/src/bench_comp_2.py b/src/bench_comp_2.py
@@ -2,7 +2,7 @@
 import csv
 import sys
 from typing import List, NamedTuple
-from collections import defaultdict, Counter
+from collections import defaultdict
 
 class ToolData(NamedTuple):
     command: str
@@ -74,14 +74,17 @@ def rank_tools(data: List[ToolData]) -> List[ToolData]:
     # Sort by mean execution time
     return sorted(averaged_data, key=lambda x: x.mean)
 
-def count_fastest_tools(file_paths: List[str]) -> Counter:
-    fastest_tools = Counter()
+def calculate_tool_scores(file_paths: List[str]) -> dict:
+    tool_scores = defaultdict(int)
     for file_path in file_paths:
         data = parse_csv(file_path)
         if data:
-            fastest_tool = min(data, key=lambda x: x.mean).command
-            fastest_tools[fastest_tool] += 1
-    return fastest_tools
+            sorted_tools = sorted(data, key=lambda x: x.mean)
+            num_tools = len(sorted_tools)
+            for rank, tool in enumerate(sorted_tools, start=1):
+                tool_scores[tool.command] += num_tools - rank
+
+    return tool_scores
 
 def print_ranked_tools(ranked_tools: List[ToolData]):
     print("Ranked list of tools (from fastest to slowest):")
@@ -91,27 +94,29 @@ def print_ranked_tools(ranked_tools: List[ToolData]):
     for i, tool in enumerate(ranked_tools, 1):
         print(f"{i:<5}{tool.command:<20}{tool.mean:<12.6f}{tool.median:<12.6f}{tool.stddev:<12.6f}{tool.min:<12.6f}{tool.max:<12.6f}")
 
-def print_fastest_tool_frequency(fastest_tools: Counter):
-    print("\nRanked list of tools by frequency of being fastest:")
+def print_tool_scores(tool_scores: dict):
+    print("\nRanked list of tools by score (NUM-RANK points):")
     print("-" * 50)
-    print(f"{'Rank':<5}{'Command':<20}{'Frequency':<10}")
+    print(f"{'Rank':<5}{'Command':<20}{'Score':<10}")
     print("-" * 50)
-    for i, (tool, count) in enumerate(fastest_tools.most_common(), 1):
-        print(f"{i:<5}{tool:<20}{count:<10}")
+    sorted_scores = sorted(tool_scores.items(), key=lambda x: x[1], reverse=True)
+    for i, (tool, score) in enumerate(sorted_scores, 1):
+        print(f"{i:<5}{tool:<20}{score:<10}")
 
 def main():
     if len(sys.argv) < 2:
         print("Usage: python script.py <csv_file1> <csv_file2> ...")
         sys.exit(1)
 
     file_paths = sys.argv[1:]
+    total_files = len(file_paths)
     all_data = process_files(file_paths)
     ranked_tools = rank_tools(all_data)
-
+    print(f"Processed {total_files} files with {len(all_data)} data points.")
     print_ranked_tools(ranked_tools)
 
-    fastest_tools = count_fastest_tools(file_paths)
-    print_fastest_tool_frequency(fastest_tools)
+    tool_scores = calculate_tool_scores(file_paths)
+    print_tool_scores(tool_scores)
 
 if __name__ == "__main__":
     main()
diff --git a/src/n50.c b/src/n50.c
@@ -197,6 +197,8 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
     uint64_t *chunk_lengths = NULL;
     int chunk_count = 0;
     int chunk_capacity = INITIAL_CAPACITY;
+
+    // Extra statistics
     uint64_t current_length = 0;
     uint64_t min = UINT64_MAX;
     uint64_t max = 0;
@@ -206,6 +208,14 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
 
     chunk_lengths = malloc(chunk_capacity * sizeof(uint64_t));
 
+    // lookup table
+        // Create a lookup table for base counting
+    uint8_t base_lookup[256] = {0};
+    base_lookup['A'] = base_lookup['a'] = 0;
+    base_lookup['C'] = base_lookup['c'] = 1;
+    base_lookup['G'] = base_lookup['g'] = 2;
+    base_lookup['T'] = base_lookup['t'] = 3;
+
     if (is_fastq) {
 
         #if DEBUG
@@ -223,7 +233,8 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
             if (bytes_read == 0) break;
 
             for (size_t i = 0; i < bytes_read; i++) {
-                if (buffer[i] == '\n') {
+                char c = buffer[i];
+                if (c == '\n') {
                     newline_count++;
                     if (newline_count % 4 == 1) {
                         in_sequence = true;
@@ -234,9 +245,7 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
                             chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(uint64_t));
                         }
                         chunk_lengths[chunk_count++] = current_length;
-                        #if DEBUG
-                            fprintf(stderr, "Debug: Recorded sequence length %lu\n", current_length);
-                        #endif
+
                         if (extra) {
                             if (current_length > max) max = current_length;
                             if (current_length < min) min = current_length;
@@ -246,22 +255,12 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
                 } else if (in_sequence) {
                     current_length++;
                     if (extra) {
-                        char c = toupper(buffer[i]);
-                        switch (c) {
-                            case 'A': bases[0]++; break;
-                            case 'C': bases[1]++; break;
-                            case 'G': bases[2]++; break;
-                            case 'T': bases[3]++; break;
-                            default: bases[4]++;
-                        }
+                        bases[base_lookup[(uint8_t)c]]++;
                     }
                 }
             }
         }
-        #if DEBUG
-            fprintf(stderr, "Debug: End of file reached. Total records processed: %d\n", chunk_count);
-        #endif
-        } else {  // FASTA
+    } else {  // FASTA
         bool in_sequence = false;
         while (1) {
             char *result;
@@ -291,14 +290,7 @@ FileStats process_file(const char *filename, bool force_fasta, bool force_fastq,
                 current_length += len;
                 if (extra) {
                     for (uint64_t i = 0; i < len; i++) {
-                        char c = toupper(buffer[i]);
-                        switch (c) {
-                            case 'A': bases[0]++; break;
-                            case 'C': bases[1]++; break;
-                            case 'G': bases[2]++; break;
-                            case 'T': bases[3]++; break;
-                            default: bases[4]++;
-                        }
+                        bases[base_lookup[(uint8_t)buffer[i]]]++;
                     }
                 }
             }

diff --git a/src/n50_binner.c b/src/n50_binner.c
@@ -0,0 +1,55 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <zlib.h>
+
+#define MAX_LINE 1000000
+#define NUM_BINS 16
+
+// Function to determine the bin for a given length
+int get_bin(int length) {
+    int bins[] = {10, 100, 1000, 2500, 5000, 10000, 20000, 35000, 50000, 75000, 100000, 200000, 300000, 500000, 750000, 1000000};
+    for (int i = 0; i < NUM_BINS; i++) {
+        if (length <= bins[i]) {
+            return i;
+        }
+    }
+    return NUM_BINS - 1;  // For reads longer than the last bin
+}
+
+int main(int argc, char *argv[]) {
+    if (argc != 2) {
+        fprintf(stderr, "Usage: %s <fastq_file>\n", argv[0]);
+        return 1;
+    }
+
+    gzFile fp = gzopen(argv[1], "r");
+    if (!fp) {
+        fprintf(stderr, "Error: Could not open file %s\n", argv[1]);
+        return 1;
+    }
+
+    char line[MAX_LINE];
+    int counters[NUM_BINS] = {0};
+    int line_count = 0;
+
+    while (gzgets(fp, line, sizeof(line))) {
+        line_count++;
+        if (line_count % 4 == 2) {  // This is the sequence line
+            int length = strlen(line) - 1;  // Subtract 1 to remove newline
+            int bin = get_bin(length);
+            counters[bin]++;
+        }
+    }
+
+    gzclose(fp);
+
+    // Print results
+    printf("Bin,Number of Reads\n");
+    int bins[] = {10, 100, 1000, 2500, 5000, 10000, 20000, 35000, 50000, 75000, 100000, 200000, 300000, 500000, 750000, 1000000};
+    for (int i = 0; i < NUM_BINS; i++) {
+        printf("%d,%d\n", bins[i], counters[i]);
+    }
+
+    return 0;
+}
diff --git a/src/n50_simreads.c b/src/n50_simreads.c
@@ -8,6 +8,8 @@
 
 #define MAX_ARGS 100 // Maximum number of arguments
 #define MAX_PATH 1024 // Maximum path length
+// make const char bases[] = "ACGTactAC"; // 70% AT, 30% CG as global constant
+const char bases[] = "ACGTactAC"; 
 /*
     n50 suite - simulate reads
     Usage: n50_simreads [--fasta|--fastq] -o OUTDIR ARGS
@@ -21,7 +23,8 @@
 */
 // Function to generate a random DNA sequence
 void generate_sequence(char *seq, int length) {
-    const char bases[] = "ACGTactAC"; // 70% AT, 30% CG
+    // Will receive two parameters, a pointer to a char and an int
+
     for (int i = 0; i < length; i++) {
         seq[i] = bases[rand() % 9];
     }
@@ -83,28 +86,46 @@ int calculate_n50(const int *lengths, int num_seqs, long long *total_length) {
 
 int main(int argc, char *argv[]) {
     if (argc < 5) {
-        fprintf(stderr, "Usage: %s [--fasta|--fastq] -o OUTDIR ARGS\n", argv[0]);
+        fprintf(stderr, "Usage: %s [--fasta|--fastq] -o OUTDIR [-p PREFIX] ARGS\n", argv[0]);
+        fprintf(stderr, "ARGS format: COUNT*SIZE\n");
         return 1;
-    }
+    } else if (argc > MAX_ARGS) {
+        fprintf(stderr, "Too many arguments. Maximum is %d.\n", MAX_ARGS);
+        return 1;
+    } 
 
     int is_fastq = 0;
+    int verbose = 0;
     const char *format = is_fastq ? "FASTQ" : "FASTA";
     char *outdir = NULL;
+    char *prefix = NULL;
 
-    for (int i = 1; i < argc - 1; i++) {
+    for (int i = 1; i < argc; i++) {
+
         if (strcmp(argv[i], "--fastq") == 0) {
             is_fastq = 1;
         } else if (strcmp(argv[i], "--fasta") == 0) {
             is_fastq = 0;
         } else if (strcmp(argv[i], "-o") == 0) {
             outdir = argv[++i];
+        } else if (strcmp(argv[i], "--verbose") == 0) {
+            verbose = 1;
+            fprintf(stderr, "Verbose mode enabled.\n");
+        } else if (strcmp(argv[i], "-p") == 0) {
+            prefix = argv[++i];
+        } else if (!strchr(argv[i], '*')) {
+            fprintf(stderr, "Invalid argument: %s\n", argv[i]);
+            return 1;
         }
     }
 
     if (!outdir) {
         fprintf(stderr, "Output directory not specified. Use -o OUTDIR.\n");
         return 1;
     }
+    if (!prefix) {
+        prefix = "x";
+    }
 
     // Create output directory if it doesn't exist
     struct stat st = {0};
@@ -118,7 +139,11 @@ int main(int argc, char *argv[]) {
     int *lengths = NULL;
     int lengths_capacity = 0;    // Capacity of lengths array
 
-    for (int i = 4; i < argc; i++) {
+    for (int i = 0; i < argc; i++) {
+        // if "*" is not found, continue
+        if (!strchr(argv[i], '*')) {
+            continue;
+        }
         char *arg = argv[i];
         char *count_str = strtok(arg, "*");
         char *size_str = strtok(NULL, "*");
@@ -130,8 +155,10 @@ int main(int argc, char *argv[]) {
 
         int count = atoi(count_str);
         int size = parse_size(size_str);
-        // print to stderr the count and size
-        fprintf(stderr, "# Write seqs: %d, of size: %d\n", count, size);
+
+        if (verbose) {
+            fprintf(stderr, "To do: %d sequences of size %d\n", count, size);
+        }
         // Reallocate lengths array if necessary
         if (total_seqs + count > lengths_capacity) {
             lengths_capacity = total_seqs + count;
@@ -144,17 +171,16 @@ int main(int argc, char *argv[]) {
 
         for (int j = 0; j < count; j++) {
             lengths[total_seqs++] = size;
-            fprintf(stderr, " + Seq %d: %d\n", total_seqs, size);
         }
     }
 
     long long total_length;
     int n50 = calculate_n50(lengths, total_seqs, &total_length);
 
     // print N50, total seqs and total length to STDERR
-    fprintf(stderr, "Format:\t%s\nN50:\t%d\nTot seqs:\t%d\nTot len:\t%lld\n", format, n50, total_seqs, total_length);
+    fprintf(stderr, "\n------\nMode:\t%s\nPrefix:\t%s\nFormat:\t%s\nN50:\t%d\nTot seqs:\t%d\nTot len:\t%lld\n------\n", verbose ? "verbose" : "standard", prefix, format,n50, total_seqs, total_length);
     char filename[MAX_PATH];
-    snprintf(filename, MAX_PATH, "%s/%d_%d_%lld.%s", outdir, n50, total_seqs, total_length, is_fastq ? "fastq" : "fasta");
+    snprintf(filename, MAX_PATH, "%s/%s%d_%d_%lld.%s", outdir, prefix, n50, total_seqs, total_length, is_fastq ? "fastq" : "fasta");
 
     FILE *outfile = fopen(filename, "w");
     if (!outfile) {
@@ -168,9 +194,19 @@ int main(int argc, char *argv[]) {
     if (is_fastq) {
         quality = malloc(n50 + 1);
     }
-
+    int last_length = 0;
     for (int i = 0; i < total_seqs; i++) {
+        // print to stderr the count and size
+        if (lengths[i] != last_length) {
+
+            last_length = lengths[i];
+        }
+
+
         generate_sequence(sequence, lengths[i]);
+        if (verbose && i % 1000 == 0) {
+            fprintf(stderr, " Generating seq #%d (%d bp)\r", i, lengths[i]);
+        }
 
         if (is_fastq) {
             generate_quality(quality, lengths[i]);
@@ -184,7 +220,7 @@ int main(int argc, char *argv[]) {
     free(sequence);
     if (quality) free(quality);
     free(lengths);
-
+    fprintf(stderr, "\n");
     printf("Output written to: %s\n", filename);
 
     return 0;

diff --git a/src/remake.sh b/src/remake.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# Check if required arguments are provided
+if [ "$#" -lt 3 ]; then
+    echo "Usage: $0 <input_file> <FORMAT> <OUTDIR>"
+    exit 1
+fi
+
+INPUT_FILE=$1
+FORMAT=$2
+OUTDIR=$3
+
+# Process the input file and generate the string
+GENERATED_STRING=$(awk -F',' '
+    NR>1 {
+        if ($1 ~ /^[0-9]+$/ && $2 ~ /^[0-9]+$/ && $2 > 0) {
+            printf "%d*%d ", $2, $1
+        }
+    }
+' "$INPUT_FILE" | sed 's/ $//')
+
+# Run the n50_simreads command with the generated string
+n50_simreads --${FORMAT} -o ${OUTDIR} $GENERATED_STRING
+
+# Print the command that was executed (for verification)
+echo "Executed command: n50_simreads --${FORMAT} -o ${OUTDIR} $GENERATED_STRING"
+
+# Calculate and print total number of reads
+TOTAL_READS=$(awk -F',' 'NR>1 && $2 ~ /^[0-9]+$/ {sum += $2} END {print sum}' "$INPUT_FILE")
+echo "Total number of reads: $TOTAL_READS"
+
+# Find the maximum read length
+MAX_LENGTH=$(awk -F',' 'NR>1 && $1 ~ /^[0-9]+$/ && $2 > 0 {max=$1} END {print max}' "$INPUT_FILE")
+echo "Maximum read length: $MAX_LENGTH"