From e25f69f2b83f5f4576460c2cefd6fad26877d10c Mon Sep 17 00:00:00 2001
From: "Andrea Telatin, M1" <15690844+telatin@users.noreply.github.com>
Date: Sun, 22 Sep 2024 11:32:10 +0100
Subject: [PATCH] Update test

---
 src/n50_parallel.c | 304 ---------------------------------------------
 src/n50_simreads.c |   2 +-
 test/test.sh       |  46 ++++++-
 3 files changed, 46 insertions(+), 306 deletions(-)
 delete mode 100644 src/n50_parallel.c

diff --git a/src/n50_parallel.c b/src/n50_parallel.c
deleted file mode 100644
index 9e41d0e..0000000
--- a/src/n50_parallel.c
+++ /dev/null
@@ -1,304 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <pthread.h>
-#include <ctype.h>
-#include <getopt.h>
-#include <stdbool.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/stat.h>
-
-
-#define MAX_THREADS 16
-#define INITIAL_CAPACITY 1000000
-#define VERSION "2.0.0"
-
-typedef struct {
-    char *filename;
-    long long total_length;
-    int length_count;
-    int n50;
-    bool is_fastq;
-} FileStats;
-
-typedef struct {
-    char *start;
-    size_t size;
-    int *lengths;
-    int capacity;
-    int count;
-    long long total_length;
-    bool is_fastq;
-} ThreadData;
-
-void print_usage(const char *program_name) {
-    printf("Usage: %s [OPTIONS] [FILENAME...]\n", program_name);
-    printf("\nOptions:\n");
-    printf("  -a, --fasta        Force FASTA input format\n");
-    printf("  -q, --fastq        Force FASTQ input format\n");
-    printf("  -h, --header       Print header in output\n");
-    printf("  -n, --n50          Output only N50 value\n");
-    printf("      --help         Display this help message and exit\n");
-    printf("      --version      Display version information and exit\n");
-    printf("\nDescription:\n");
-    printf("  Calculate N50 and other sequence statistics from FASTA or FASTQ files.\n");
-    printf("  Supports multiple input files and automatic format detection.\n");
-}
-
-void print_version() {
-    printf("N50 Calculator version %s\n", VERSION);
-    printf("Copyright (C) 2024 Andrea Telatin\n");
-    printf("License: MIT\n");
-}
-
-int compare(const void *a, const void *b) {
-    return (*(int*)b - *(int*)a);
-}
-
-void *parallel_read_and_parse(void *arg) {
-    ThreadData *data = (ThreadData *)arg;
-    char *current = data->start;
-    char *end = data->start + data->size;
-    int current_length = 0;
-
-    while (current < end) {
-        if (data->is_fastq) {
-            // FASTQ parsing
-            if (*current == '@') {
-                while (current < end && *current != '\n') current++;
-                current++;  // Skip newline
-                char *seq_start = current;
-                while (current < end && *current != '\n') current++;
-                int seq_length = current - seq_start;
-                
-                if (data->count == data->capacity) {
-                    data->capacity *= 2;
-                    data->lengths = realloc(data->lengths, data->capacity * sizeof(int));
-                }
-                data->lengths[data->count++] = seq_length;
-                data->total_length += seq_length;
-
-                // Skip quality line
-                current += 2;  // Skip '+' and newline
-                while (current < end && *current != '\n') current++;
-                current++;  // Skip newline
-            } else {
-                current++;
-            }
-        } else {
-            // FASTA parsing
-            if (*current == '>') {
-                if (current_length > 0) {
-                    if (data->count == data->capacity) {
-                        data->capacity *= 2;
-                        data->lengths = realloc(data->lengths, data->capacity * sizeof(int));
-                    }
-                    data->lengths[data->count++] = current_length;
-                    data->total_length += current_length;
-                    current_length = 0;
-                }
-                while (current < end && *current != '\n') current++;
-                current++;  // Skip newline
-            } else {
-                current_length += (*current >= 'A' && *current <= 'Z') || (*current >= 'a' && *current <= 'z');
-                current++;
-            }
-        }
-    }
-
-    // Handle last sequence for FASTA
-    if (!data->is_fastq && current_length > 0) {
-        if (data->count == data->capacity) {
-            data->capacity *= 2;
-            data->lengths = realloc(data->lengths, data->capacity * sizeof(int));
-        }
-        data->lengths[data->count++] = current_length;
-        data->total_length += current_length;
-    }
-
-    return NULL;
-}
-
-void parallel_merge_sort(int *arr, int *temp, int left, int right) {
-    if (left < right) {
-        int mid = left + (right - left) / 2;
-        #pragma omp task
-        parallel_merge_sort(arr, temp, left, mid);
-        #pragma omp task
-        parallel_merge_sort(arr, temp, mid + 1, right);
-        #pragma omp taskwait
-
-        // Merge
-        int i = left, j = mid + 1, k = left;
-        while (i <= mid && j <= right) {
-            if (arr[i] >= arr[j]) {
-                temp[k++] = arr[i++];
-            } else {
-                temp[k++] = arr[j++];
-            }
-        }
-        while (i <= mid) temp[k++] = arr[i++];
-        while (j <= right) temp[k++] = arr[j++];
-        for (i = left; i <= right; i++) arr[i] = temp[i];
-    }
-}
-
-FileStats process_file(const char *filename, bool force_fasta, bool force_fastq) {
-    int fd = open(filename, O_RDONLY);
-    if (fd == -1) {
-        fprintf(stderr, "Error: Cannot open file %s\n", filename);
-        exit(1);
-    }
-
-    off_t file_size = lseek(fd, 0, SEEK_END);
-    char *file_content = mmap(NULL, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
-    if (file_content == MAP_FAILED) {
-        fprintf(stderr, "Error: Cannot mmap file %s\n", filename);
-        close(fd);
-        exit(1);
-    }
-
-    bool is_fastq = force_fastq || (!force_fasta && (strstr(filename, ".fastq") != NULL || strstr(filename, ".fq") != NULL));
-    int num_threads = MAX_THREADS;
-    size_t chunk_size = file_size / num_threads;
-
-    ThreadData thread_data[MAX_THREADS];
-    pthread_t threads[MAX_THREADS];
-
-    for (int i = 0; i < num_threads; i++) {
-        thread_data[i].start = file_content + i * chunk_size;
-        thread_data[i].size = (i == num_threads - 1) ? file_size - i * chunk_size : chunk_size;
-        thread_data[i].lengths = malloc(INITIAL_CAPACITY * sizeof(int));
-        thread_data[i].capacity = INITIAL_CAPACITY;
-        thread_data[i].count = 0;
-        thread_data[i].total_length = 0;
-        thread_data[i].is_fastq = is_fastq;
-        pthread_create(&threads[i], NULL, parallel_read_and_parse, &thread_data[i]);
-    }
-
-    long long total_length = 0;
-    int total_count = 0;
-    for (int i = 0; i < num_threads; i++) {
-        pthread_join(threads[i], NULL);
-        total_length += thread_data[i].total_length;
-        total_count += thread_data[i].count;
-    }
-
-    // Merge sorted arrays
-    int *merged_lengths = malloc(total_count * sizeof(int));
-    int *temp = malloc(total_count * sizeof(int));
-    int index = 0;
-    for (int i = 0; i < num_threads; i++) {
-        memcpy(merged_lengths + index, thread_data[i].lengths, thread_data[i].count * sizeof(int));
-        index += thread_data[i].count;
-        free(thread_data[i].lengths);
-    }
-
-    // Parallel sort
-    #pragma omp parallel
-    {
-        #pragma omp single
-        parallel_merge_sort(merged_lengths, temp, 0, total_count - 1);
-    }
-
-    free(temp);
-
-    // Calculate N50
-    long long cumulative_length = 0;
-    long long half_total_length = total_length / 2;
-    int n50 = 0;
-    #pragma omp parallel for reduction(+:cumulative_length) shared(n50)
-    for (int i = 0; i < total_count; i++) {
-        cumulative_length += merged_lengths[i];
-        if (cumulative_length >= half_total_length && n50 == 0) {
-            #pragma omp critical
-            {
-                if (n50 == 0) n50 = merged_lengths[i];
-            }
-        }
-    }
-
-    free(merged_lengths);
-    munmap(file_content, file_size);
-    close(fd);
-
-    FileStats stats = {
-        .filename = strdup(filename),
-        .total_length = total_length,
-        .length_count = total_count,
-        .n50 = n50,
-        .is_fastq = is_fastq
-    };
-
-    return stats;
-}
-
-int main(int argc, char *argv[]) {
-    bool opt_header = false;
-    bool opt_n50 = false;
-    bool force_fasta = false;
-    bool force_fastq = false;
-
-    static struct option long_options[] = {
-        {"fasta", no_argument, 0, 'a'},
-        {"fastq", no_argument, 0, 'q'},
-        {"header", no_argument, 0, 'H'},
-        {"n50", no_argument, 0, 'n'},
-        {"help", no_argument, 0, 'h'},
-        {"version", no_argument, 0, 'v'},
-        {0, 0, 0, 0}
-    };
-
-    int opt;
-    while ((opt = getopt_long(argc, argv, "aqhnHv", long_options, NULL)) != -1) {
-        switch (opt) {
-            case 'a':
-                force_fasta = true;
-                break;
-            case 'q':
-                force_fastq = true;
-                break;
-            case 'H':
-                opt_header = true;
-                break;
-            case 'n':
-                opt_n50 = true;
-                break;
-            case 'h':
-                print_usage(argv[0]);
-                return 0;
-            case 'v':
-                print_version();
-                return 0;
-            default:
-                fprintf(stderr, "Try '%s --help' for more information.\n", argv[0]);
-                return 1;
-        }
-    }
-
-    if (optind == argc) {
-        fprintf(stderr, "Error: No input files specified\n");
-        fprintf(stderr, "Try '%s --help' for more information.\n", argv[0]);
-        return 1;
-    }
-
-    if (opt_header && !opt_n50) {
-        printf("Filename\tFormat\tTotal_Length\tTotal_Sequences\tN50\n");
-    }
-
-    for (int i = optind; i < argc; i++) {
-        FileStats stats = process_file(argv[i], force_fasta, force_fastq);
-        
-        if (opt_n50) {
-            printf("%s\t%d\n", stats.filename, stats.n50);
-        } else {
-            printf("%s\t%s\t%lld\t%d\t%d\n", stats.filename, stats.is_fastq ? "FASTQ" : "FASTA", stats.total_length, stats.length_count, stats.n50);
-        }
-
-        free(stats.filename);
-    }
-
-    return 0;
-}
\ No newline at end of file
diff --git a/src/n50_simreads.c b/src/n50_simreads.c
index 5519538..57460d9 100644
--- a/src/n50_simreads.c
+++ b/src/n50_simreads.c
@@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
         return 1;
     }
     if (!prefix) {
-        prefix = "x";
+        prefix = "";
     }
 
     // Create output directory if it doesn't exist
diff --git a/test/test.sh b/test/test.sh
index dc1a8b9..ea27c4b 100644
--- a/test/test.sh
+++ b/test/test.sh
@@ -4,7 +4,7 @@ SELF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 PARENT_DIR="$(dirname "$SELF_DIR")"
 BIN_DIR="$PARENT_DIR/bin"
 # Check binary presence
-for BIN in n50 n50_simreads gen;
+for BIN in n50 n50_simreads gen n50_binner;
 do
     if [ ! -f "$BIN_DIR/$BIN" ]; then
         echo "Binary not found: $BIN"
@@ -86,7 +86,51 @@ do
     echo ""
 done
 
+
+# Test n50_binner
+RAND_TEMP_DIR=$(mktemp -d)
+echo "TESTING BINNER"
+echo "Random temp dir: $RAND_TEMP_DIR"
+OUTPUT=$("${BIN_DIR}"/n50_simreads --fastq -o "$RAND_TEMP_DIR" 100*250 20*1k 1*50k 2> /dev/null | cut -f 2 -d ":" | sed 's/ //g')
+
+
+"${BIN_DIR}"/n50_binner "$OUTPUT" > "$RAND_TEMP_DIR"/binned.txt
+
+# Check if all lines in the output contains 1 and only 1 ","
+if [ "$(grep -c "," "$RAND_TEMP_DIR"/binned.txt)" -ne "$(wc -l < "$RAND_TEMP_DIR"/binned.txt)" ]; then
+    echo "ERROR: n50_binner output is not in the expected format"
+    exit 1
+else
+    echo "OK: n50_binner output is in the expected format"
+fi 
+
+# Check that the output file contains 1 line with 1 as word (-w), and 1 line with 120 as a word
+if [ "$(grep -c -w "1" "$RAND_TEMP_DIR"/binned.txt)" -ne 1 ]; then
+    echo "ERROR: n50_binner output does not contain 1"
+    exit 1
+else
+    echo "OK: n50_binner output contains 1"
+fi
+
+if [ "$(grep -c -w "120" "$RAND_TEMP_DIR"/binned.txt)" -ne 1 ]; then
+    echo "ERROR: n50_binner output does not contain 120"
+    exit 1
+else
+    echo "OK: n50_binner output contains 120"
+fi
+
+rm -rf "$RAND_TEMP_DIR"
+# Check 
+# --------------------------------------------------------------------------------
+# Hyperfine section
+# --------------------------------------------------------------------------------
+
 # if hyperfine and seqkit and seqfu are available go on
+# if NO_HYPERFINE is defined, skip
+if [ -n "$NO_HYPERFINE" ]; then
+    echo "INFO Skipping benchmar NO_HYPERFINE is defined"
+    exit 0
+fi
 if [ -z "$(which hyperfine)" ] || [ -z "$(which seqkit)" ] || [ -z "$(which seqfu)" ]; then
     echo "Skipping benchmark"
     exit 0