From 1f43377783a76ef10962ee0dbe679055d5f3f1ad Mon Sep 17 00:00:00 2001 From: "Andrea Telatin, M1" <15690844+telatin@users.noreply.github.com> Date: Sun, 22 Sep 2024 21:57:55 +0100 Subject: [PATCH] add NO_HYPERFINE --- README_BENCHMARK.md | 0 README_GEN.md | 111 --------- README_N50.md | 102 --------- src/gen-thread.c | 317 -------------------------- src/{test.py => generator_test.py} | 28 ++- src/n50.bak | 352 ----------------------------- src/n50_binner.c | 2 +- src/n50_generate.c | 211 +++++++++++++++++ src/n50_simreads.bak | 91 -------- src/n60.c | 226 ------------------ test/dirbench.py | 82 +++++++ test/test.sh | 10 +- 12 files changed, 325 insertions(+), 1207 deletions(-) delete mode 100644 README_BENCHMARK.md delete mode 100644 README_GEN.md delete mode 100644 README_N50.md delete mode 100644 src/gen-thread.c rename src/{test.py => generator_test.py} (80%) delete mode 100644 src/n50.bak create mode 100644 src/n50_generate.c delete mode 100644 src/n50_simreads.bak delete mode 100644 src/n60.c create mode 100755 test/dirbench.py diff --git a/README_BENCHMARK.md b/README_BENCHMARK.md deleted file mode 100644 index e69de29..0000000 diff --git a/README_GEN.md b/README_GEN.md deleted file mode 100644 index b94b55d..0000000 --- a/README_GEN.md +++ /dev/null @@ -1,111 +0,0 @@ -# `gen`: DNA Sequence Generator - -## Overview - -The DNA Sequence Generator is a C program designed to create random DNA sequences in FASTA or FASTQ format. It generates multiple files containing sequences of varying lengths, simulating genomic data for testing purposes. This tool is particularly useful for: - -- Testing sequence analysis software -- Benchmarking bioinformatics pipelines -- Generating sample data for educational purposes - -## Features - -- Calculates and includes N50 statistics in the output filenames -- Supports both FASTA and FASTQ output formats -- Deterministic output based on a provided seed for reproducibility -- Output filename format: `N50_TOTSEQS_SUMLEN.{fasta|fastq}` to make easy to test the N50 calculation - -## Compilation - -To compile the program, use a C compiler such as GCC: - -```bash -gcc -o dna_generator dna_generator.c -lm -``` - -## Usage - -```bash -./gen -``` - -### Parameters - -- ``: Minimum number of sequences per file -- ``: Maximum number of sequences per file -- ``: Minimum length of each sequence -- ``: Maximum length of each sequence -- ``: Total number of files to generate -- ``: Output format (either "fasta" or "fastq") -- ``: Directory to store the output files -- ``: Seed for the random number generator (for reproducibility) - -### Example - -```bash -gen 10 100 1000 10000 5 fasta output_dir -``` - -This command will generate 5 FASTA files in the `output_dir` directory. Each file will contain between 10 and 100 sequences, with lengths ranging from 1000 to 10000 base pairs. The random number generator will be initialized with a static seed to ensure reproducibility. - -## Output - -The program generates files with names in the format: - -``` -N50_TOTSEQS_SUMLEN.{fasta|fastq} -``` - -Where: -- `N50` is the N50 statistic of the sequences in the file -- `TOTSEQS` is the total number of sequences in the file -- `SUMLEN` is the sum of all sequence lengths in the file - -### FASTA Format - -In FASTA format, each sequence is represented as: - -``` ->seq1 -ATCGATCGATCG... ->seq2 -GCTAGCTAGCTA... -``` - -### FASTQ Format - -In FASTQ format, each sequence is represented as: - -```text -@seq1 -ATCGATCGATCG... -+ -IIIIIIIIIIII... -@seq2 -GCTAGCTAGCTA... -+ -IIIIIIIIIIII... -``` - -Note: The quality scores in FASTQ format are dummy values (all 'I') for simplicity. - -## Key Functions - -- `calculate_n50`: Calculates the N50 statistic for a set of sequences -- `generate_sequence`: Generates a random DNA sequence of a given length -- `write_fasta`: Writes sequences to a file in FASTA format -- `write_fastq`: Writes sequences to a file in FASTQ format - -## Limitations and Future Improvements - -- Could be extended to support more realistic DNA models (using existing distributions from real datasets) -## License - -This program is provided under the MIT License. See the source code for full license text. - -## Author - -Andrea Telatin, 2023 -Quadram Institute Bioscience - - \ No newline at end of file diff --git a/README_N50.md b/README_N50.md deleted file mode 100644 index 52500b4..0000000 --- a/README_N50.md +++ /dev/null @@ -1,102 +0,0 @@ -# `n50` - Calculate N50 - -> A fast and efficient tool for calculating N50 and other sequence statistics from FASTA and FASTQ files. - -## Features - - -- Supports both FASTA and FASTQ formats -- Optimized for FASTQ raw file (Nanopore, PacBio) -- Handles gzipped input files - -## Installation - -### Prerequisites - -- GCC compiler -- zlib library -- pthread library - -### Compiling - -To compile the program, use the following command: - -```bash -make -``` - -or compile binaries like: - -```bash -gcc -o n50 src/n50.c -lz -lpthread -O3 -``` - -## Usage - -```bash -./n50 [options] [filename]... -``` - -If no filename is provided, the program reads from standard input. - -### Options - -- `--fasta` or `-a`: Force FASTA input format -- `--fastq` or `-q`: Force FASTQ input format -- `--header` or `-H`: Print header in the output -- `--n50` or `-n`: Output only the N50 value -- `--version` and `--help` to display version and help information respectively - -### Examples - -1. Process a FASTA file: - -```bash -./n50 input.fasta -``` - -2. Process a gzipped FASTQ file: - -```bash -./n50 input.fastq.gz -``` - -3. Process a file with header output: - -```bash -./n50 --header input.fasta -``` - -4. Output only the N50 value: - -```bash -./n50 --n50 input.fasta -``` - -5. Process input from stdin: - -```bash -cat input.fasta | ./n50 --format fasta -``` - -## Output - -By default, the program outputs a tab-separated line with the following fields: - -1. Format (FASTA or FASTQ) -2. Total sequence length -3. Total number of sequences -4. N50 value - -When using the `--header` option, a header line is printed before the results. - -When using the `--n50` option, only the N50 value is printed. - -## Performance - -The program uses multi-threading to process large files efficiently. It automatically adjusts the number of threads based on the input size, up to a maximum of 8 threads. - -## Limitations - -- The maximum number of threads is currently set to 8. This can be adjusted by modifying the `MAX_THREADS` constant in the source code. -- The initial capacity for storing sequence lengths is set to 1,000,000. For extremely large datasets, this value might need to be increased. diff --git a/src/gen-thread.c b/src/gen-thread.c deleted file mode 100644 index acddc61..0000000 --- a/src/gen-thread.c +++ /dev/null @@ -1,317 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#define MAX_SEQ_NAME_LEN 256 -#define MAX_FILENAME_LEN 256 -#define MAX_THREADS 16 -#define CHUNK_SIZE 1000 // Number of sequences to process in each chunk - -// Thread-related structures -typedef struct { - char **sequences; - int *lengths; - int num_seqs; - int *next_seq; - pthread_mutex_t *mutex; -} ThreadData; - -// Function prototypes -long long calculate_n50(const int *lengths, int num_seqs, long long *total_length); -void *generate_sequences(void *arg); -void gen_ctg_len(int min_len, int max_len, int num_seqs, int *lengths); -void generate_sequence(int length, char *sequence); -void free_resources(char **sequences, int *lengths, int num_seqs); -void write_sequences(char **sequences, int *lengths, int num_seqs, const char *outfile, const char *format); - -int main(int argc, char *argv[]) { - if (argc < 8) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - int min_seqs = atoi(argv[1]); - int max_seqs = atoi(argv[2]); - int min_len = atoi(argv[3]); - int max_len = atoi(argv[4]); - int tot_files = atoi(argv[5]); - char *format = argv[6]; - char *outdir = argv[7]; - // if specified, the number of threads to use else 1 - int num_threads = 1; - if (argc == 9) { - num_threads = atoi(argv[8]); - } - - - // Input validation - if (min_seqs <= 0 || max_seqs <= 0 || min_len <= 0 || max_len <= 0 || tot_files <= 0 || num_threads <= 0 || num_threads > MAX_THREADS) { - fprintf(stderr, "Error: Invalid input parameters.\n"); - return 1; - } - - if (min_seqs > max_seqs || min_len > max_len) { - fprintf(stderr, "Error: Min values must be less than or equal to max values.\n"); - return 1; - } - - // Convert format to lowercase for case-insensitive comparison - for (char *p = format; *p; ++p) *p = tolower(*p); - if (strcmp(format, "fasta") != 0 && strcmp(format, "fastq") != 0) { - fprintf(stderr, "Error: Invalid format. Use 'fasta' or 'fastq'.\n"); - return 1; - } - - // Log input parameters - fprintf(stderr, "Parameters: min_seqs=%d, max_seqs=%d, min_len=%d, max_len=%d, tot_files=%d, format=%s, outdir=%s, threads=%d\n", - min_seqs, max_seqs, min_len, max_len, tot_files, format, outdir, num_threads); - - int seed = 42; // Fixed seed for reproducibility - srand(seed); - - for (int i = 1; i <= tot_files; i++) { - int num_seqs = rand() % (max_seqs - min_seqs + 1) + min_seqs; - int *contig_lengths = malloc(num_seqs * sizeof(int)); - if (!contig_lengths) { - fprintf(stderr, "Memory allocation failed for contig_lengths\n"); - return 1; - } - fprintf(stderr, "%d/%d %s file (%d num seqs)", i, tot_files, format, num_seqs); - gen_ctg_len(min_len, max_len, num_seqs, contig_lengths); - fprintf(stderr, ", writing:\n"); - long long total_length; - int N50 = calculate_n50(contig_lengths, num_seqs, &total_length); - fprintf(stderr, "\tTotal length: %lld\n", total_length); - fprintf(stderr, "\tN50: %d\n", N50); - char outfile[MAX_FILENAME_LEN]; - snprintf(outfile, sizeof(outfile), "%s/%d_%d_%lld.%s", outdir, N50, num_seqs, total_length, format); - - char **sequences = malloc(num_seqs * sizeof(char *)); - if (!sequences) { - fprintf(stderr, "Memory allocation failed for sequences\n"); - free(contig_lengths); - return 1; - } - - for (int j = 0; j < num_seqs; j++) { - sequences[j] = malloc((contig_lengths[j] + 1) * sizeof(char)); - if (!sequences[j]) { - fprintf(stderr, "Memory allocation failed for sequence %d\n", j); - free_resources(sequences, contig_lengths, j); - return 1; - } - } - - // Set up threading for sequence generation - pthread_t threads[num_threads]; - ThreadData thread_data; - thread_data.sequences = sequences; - thread_data.lengths = contig_lengths; - thread_data.num_seqs = num_seqs; - - int next_seq = 0; - thread_data.next_seq = &next_seq; - - pthread_mutex_t mutex; - pthread_mutex_init(&mutex, NULL); - thread_data.mutex = &mutex; - - for (int t = 0; t < num_threads; t++) { - if (pthread_create(&threads[t], NULL, generate_sequences, (void *)&thread_data) != 0) { - fprintf(stderr, "Failed to create thread %d\n", t); - free_resources(sequences, contig_lengths, num_seqs); - return 1; - } - } - - // Wait for all threads to complete - for (int t = 0; t < num_threads; t++) { - pthread_join(threads[t], NULL); - } - - pthread_mutex_destroy(&mutex); - - // Write sequences to file (single-threaded) - write_sequences(sequences, contig_lengths, num_seqs, outfile, format); - - fprintf(stderr, " [Done: %s]\n", outfile); - free_resources(sequences, contig_lengths, num_seqs); - } - - return 0; -} - -void gen_ctg_len(int min_len, int max_len, int num_seqs, int *lengths) { - // calculate the even distance between num_seqs sequences spanning from min_len to max_len - long long STEP = (max_len - min_len) / num_seqs; - for (int i = 0; i < num_seqs; i++) { - lengths[i] = STEP * i + min_len; - } -} - -int compare_ints(const void *a, const void *b) { - return (*(int*)b - *(int*)a); -} -long long calculate_n50(const int *lengths, int num_seqs, long long *total_length) { - int sorted_lengths[num_seqs]; - memcpy(sorted_lengths, lengths, sizeof(int) * num_seqs); - - qsort(sorted_lengths, num_seqs, sizeof(int), compare_ints); - - *total_length = 0; - for (int i = 0; i < num_seqs; i++) { - *total_length += sorted_lengths[i]; - } - - long long cumulative_length = 0; - for (int i = 0; i < num_seqs; i++) { - cumulative_length += sorted_lengths[i]; - if (cumulative_length >= *total_length / 2) { - return sorted_lengths[i]; - } - } - - return -1; // Should never reach this point -} - -int *generate_contigs(int N50, int SUM_LEN, int TOT_SEQS) { - if (N50 > SUM_LEN || TOT_SEQS < 1) { - fprintf(stderr, "Invalid input: N50 must be <= SUM_LEN and TOT_SEQS >= 1\n"); - return NULL; - } - - int *contig_list = malloc(TOT_SEQS * sizeof(int)); - if (!contig_list) { - fprintf(stderr, "Memory allocation failed\n"); - return NULL; - } - - // calculate int STEP defined as MAX_LEN - int MAX_LEN = rand() % (SUM_LEN - N50 + 1) + N50; - int TMP_SUM = MAX_LEN; - contig_list[0] = MAX_LEN; - int count = 1; - - while (contig_list[count - 1] > N50 && count < TOT_SEQS) { - int next_contig = rand() % (int)(contig_list[count - 1] * 0.9) + 1; - contig_list[count++] = next_contig; - TMP_SUM += next_contig; - } - - if (count < TOT_SEQS && contig_list[count - 1] > N50) { - contig_list[count - 1] = N50; - } - - while (count < TOT_SEQS) { - int remaining_sum = SUM_LEN - TMP_SUM; - if (remaining_sum <= 0) break; - int next_contig = rand() % MAX_LEN + 1; - next_contig = (remaining_sum < next_contig) ? remaining_sum : next_contig; - contig_list[count++] = next_contig; - TMP_SUM += next_contig; - } - - return contig_list; -} - - -void *generate_sequences(void *arg) { - ThreadData *data = (ThreadData *)arg; - int start, end; - - while (1) { - pthread_mutex_lock(data->mutex); - start = *data->next_seq; - end = start + CHUNK_SIZE; - if (end > data->num_seqs) end = data->num_seqs; - *data->next_seq = end; - pthread_mutex_unlock(data->mutex); - - if (start >= data->num_seqs) break; - - for (int i = start; i < end; i++) { - generate_sequence(data->lengths[i], data->sequences[i]); - } - } - - return NULL; -} - -void write_sequences(char **sequences, int *lengths, int num_seqs, const char *outfile, const char *format) { - FILE *f = fopen(outfile, "w"); - if (!f) { - fprintf(stderr, "Unable to write to file: %s\n", outfile); - return; - } - - if (strcmp(format, "fasta") == 0) { - for (int i = 0; i < num_seqs; i++) { - fprintf(f, ">seq_%d\n", i + 1); - for (int j = 0; j < lengths[i]; j += 60) { - int line_length = (lengths[i] - j < 60) ? lengths[i] - j : 60; - fprintf(f, "%.*s\n", line_length, sequences[i] + j); - } - } - } else { - for (int i = 0; i < num_seqs; i++) { - fprintf(f, "@seq%d\n%s\n+\n", i + 1, sequences[i]); - for (int j = 0; j < lengths[i]; j++) { - fputc('I', f); // Dummy quality score - } - fputc('\n', f); - } - } - - fclose(f); -} -void generate_sequence(int length, char *sequence) { - memset(sequence, 'A', length); - sequence[length] = '\0'; -} - -void write_fasta(char **sequences, int *lengths, int num_seqs, const char *outfile) { - FILE *f = fopen(outfile, "w"); - if (!f) { - fprintf(stderr, "Unable to write to file: %s\n", outfile); - return; - } - - for (int i = 0; i < num_seqs; i++) { - fprintf(f, ">seq%d\n", i + 1); - for (int j = 0; j < lengths[i]; j += 60) { - int line_length = (lengths[i] - j < 60) ? lengths[i] - j : 60; - fprintf(f, "%.*s\n", line_length, sequences[i] + j); - } - } - - fclose(f); -} - -void write_fastq(char **sequences, int *lengths, int num_seqs, const char *outfile) { - FILE *f = fopen(outfile, "w"); - if (!f) { - fprintf(stderr, "Unable to write to file: %s\n", outfile); - return; - } - - for (int i = 0; i < num_seqs; i++) { - fprintf(f, "@seq%d\n%s\n+\n", i + 1, sequences[i]); - for (int j = 0; j < lengths[i]; j++) { - fputc('I', f); // Dummy quality score - } - fputc('\n', f); - } - - fclose(f); -} - -void free_resources(char **sequences, int *lengths, int num_seqs) { - for (int i = 0; i < num_seqs; i++) { - free(sequences[i]); - } - free(sequences); - free(lengths); -} \ No newline at end of file diff --git a/src/test.py b/src/generator_test.py similarity index 80% rename from src/test.py rename to src/generator_test.py index 274054e..7115b71 100644 --- a/src/test.py +++ b/src/generator_test.py @@ -1,5 +1,13 @@ +#!/usr/bin/env python import random - +""" +This script contains a function to generate a list of contigs (sub-sequences) +based on given parameters. + + Example: + contigs = generate_contigs(100, 1000, 10) + print(contigs) +""" def gen_ctg_len(min_len, max_len, num_seqs): """ Generate a list of contig lengths. @@ -20,6 +28,22 @@ def calculate_n50(lengths): return (length, len(lengths), total_length) def generate_contigs(N50, SUM_LEN, TOT_SEQS): + """ + This module contains a function to generate a list of contigs (sub-sequences) based on given parameters. + The function ensures that the sum of the contigs equals a specified total length (SUM_LEN) and that the + number of contigs equals a specified total number of sequences (TOT_SEQS). The function also ensures that + the largest contig is at least as large as a specified N50 value. + Functions: + generate_contigs(N50, SUM_LEN, TOT_SEQS): Generates a list of contigs based on the given parameters. + Args: + N50 (int): The minimum length of the largest contig. + SUM_LEN (int): The total length of all contigs combined. + TOT_SEQS (int): The total number of contigs to generate. + Returns: + list: A list of integers representing the lengths of the contigs. + Raises: + ValueError: If N50 is greater than SUM_LEN or TOT_SEQS is less than 1. + """ if N50 > SUM_LEN or TOT_SEQS < 1: raise ValueError("Invalid input: N50 must be <= SUM_LEN and TOT_SEQS >= 1") @@ -101,7 +125,7 @@ def generate_sequences(lengths, format, outfile): if __name__ == "__main__": import argparse import sys, os - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser(help="Generate random sequences") parser.add_argument("--min-seqs", default=15, type=int, help="Minimum number of sequences") parser.add_argument("--max-seqs", default=5000, type=int, help="Maximum number of sequences") parser.add_argument("--min-len", default=12, type=int, help="Minimum length for a sequence") diff --git a/src/n50.bak b/src/n50.bak deleted file mode 100644 index 4a796cc..0000000 --- a/src/n50.bak +++ /dev/null @@ -1,352 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#define BUFFER_SIZE 4 * 1024 * 1024 // buffer -#define MAX_THREADS 16 -#define INITIAL_CAPACITY 1000000 -#define VERSION "1.9.0" - -typedef struct { - int *lengths; - int start; - int end; - long long total_length; -} ThreadData; - -typedef struct { - char *filename; - long long total_length; - int length_count; - int n50; - int max_length; - int min_length; - bool is_fastq; - long long bases[5]; // A, C, G, T, Other -} FileStats; - -void print_usage(const char *program_name) { - printf("Usage: %s [OPTIONS] [FILENAME...]\n", program_name); - printf("\nOptions:\n"); - printf(" -a, --fasta Force FASTA input format\n"); - printf(" -q, --fastq Force FASTQ input format\n"); - printf(" -H, --header Print header in output\n"); - printf(" -n, --n50 Output only N50 value\n"); - printf(" -x, --extra Output extra statistics\n"); - printf(" -h, --help Display this help message and exit\n"); - printf(" --version Display version information and exit\n"); - printf("\nDescription:\n"); - printf(" Calculate N50 and other sequence statistics from FASTA or FASTQ files.\n"); - printf(" Supports multiple input files and automatic format detection.\n"); -} - -void print_version() { - printf("N50 Calculator version %s\n", VERSION); - printf("Copyright (C) 2024 Andrea Telatin\n"); - printf("License: MIT\n"); -} - -int compare(const void *a, const void *b) { - return (*(int*)b - *(int*)a); -} - -void *process_chunk(void *arg) { - ThreadData *data = (ThreadData*)arg; - long long local_total_length = 0; - int *local_lengths = malloc((data->end - data->start) * sizeof(int)); - int local_count = 0; - - for (int i = data->start; i < data->end; i++) { - local_total_length += data->lengths[i]; - local_lengths[local_count++] = data->lengths[i]; - } - - ThreadData *result = malloc(sizeof(ThreadData)); - result->lengths = local_lengths; - result->start = data->start; - result->end = data->end; - result->total_length = local_total_length; - - return result; -} - - -// ... (keep other functions unchanged) - -FileStats process_file(const char *filename, bool force_fasta, bool force_fastq, bool extra) { - gzFile fp = gzopen(filename, "r"); - if (!fp) { - fprintf(stderr, "Error: Cannot open file %s\n", filename); - exit(1); - } - - char buffer[BUFFER_SIZE]; - int *chunk_lengths = NULL; - int chunk_count = 0; - int chunk_capacity = INITIAL_CAPACITY; - int current_length = 0; - int min = -1; - int max = 0; - long long bases[5] = {0}; // A, C, G, T, Other - - bool is_fastq = force_fastq || (!force_fasta && (strstr(filename, ".fastq") != NULL || strstr(filename, ".fq") != NULL)); - - chunk_lengths = malloc(chunk_capacity * sizeof(int)); - - if (is_fastq) { - int line_count = 0; - while (gzgets(fp, buffer, BUFFER_SIZE) != NULL) { - line_count++; - if (line_count % 4 == 2) { // Sequence line - int len = strcspn(buffer, "\n"); // Count characters until newline - if (chunk_count == chunk_capacity) { - chunk_capacity *= 2; - chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(int)); - } - chunk_lengths[chunk_count++] = len; - if (extra) { - if (len > max) { - max = len; - } - if (min == -1 || len < min) { - min = len; - } - for (int i = 0; i < len; i++) { - char c = toupper(buffer[i]); - switch (c) { - case 'A': bases[0]++; break; - case 'C': bases[1]++; break; - case 'G': bases[2]++; break; - case 'T': bases[3]++; break; - default: bases[4]++; - } - } - } - } - } - } else { - bool in_sequence = false; - while (gzgets(fp, buffer, BUFFER_SIZE) != NULL) { - if (buffer[0] == '>') { - if (current_length > 0) { - if (chunk_count == chunk_capacity) { - chunk_capacity *= 2; - chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(int)); - } - chunk_lengths[chunk_count++] = current_length; - if (extra) { - if (current_length > max) { - max = current_length; - } - if (min == -1 || current_length < min) { - min = current_length; - } - } - current_length = 0; - } - in_sequence = true; - } else if (in_sequence) { - for (char *p = buffer; *p != '\0'; p++) { - if (isalpha(*p)) { - current_length++; - if (extra) { - char c = toupper(*p); - switch (c) { - case 'A': bases[0]++; break; - case 'C': bases[1]++; break; - case 'G': bases[2]++; break; - case 'T': bases[3]++; break; - default: bases[4]++; - } - } - } - } - } - } - if (current_length > 0) { - if (chunk_count == chunk_capacity) { - chunk_capacity *= 2; - chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(int)); - } - chunk_lengths[chunk_count++] = current_length; - if (extra) { - if (current_length > max) { - max = current_length; - } - if (min == -1 || current_length < min) { - min = current_length; - } - } - } - } - - gzclose(fp); - - // Process chunks using threads - int num_threads = (chunk_count < MAX_THREADS) ? chunk_count : MAX_THREADS; - pthread_t threads[MAX_THREADS]; - ThreadData thread_data[MAX_THREADS]; - int chunk_size = chunk_count / num_threads; - - for (int i = 0; i < num_threads; i++) { - thread_data[i].lengths = chunk_lengths; - thread_data[i].start = i * chunk_size; - thread_data[i].end = (i == num_threads - 1) ? chunk_count : (i + 1) * chunk_size; - pthread_create(&threads[i], NULL, process_chunk, &thread_data[i]); - } - - long long total_length = 0; - int *lengths = malloc(chunk_count * sizeof(int)); - int length_count = 0; - - for (int i = 0; i < num_threads; i++) { - ThreadData *result; - pthread_join(threads[i], (void**)&result); - total_length += result->total_length; - memcpy(lengths + length_count, result->lengths, (result->end - result->start) * sizeof(int)); - length_count += result->end - result->start; - free(result->lengths); - free(result); - } - - free(chunk_lengths); - - // Sort lengths in descending order - qsort(lengths, length_count, sizeof(int), compare); - - // Calculate N50 - long long cumulative_length = 0; - long long half_total_length = total_length / 2; - int n50 = 0; - for (int i = 0; i < length_count; i++) { - cumulative_length += lengths[i]; - if (cumulative_length >= half_total_length) { - n50 = lengths[i]; - break; - } - } - - free(lengths); - - FileStats stats = { - .filename = strdup(filename), - .total_length = total_length, - .length_count = length_count, - .n50 = n50, - .is_fastq = is_fastq, - .max_length = max, - .min_length = min - }; - - if (extra) { - for (int i = 0; i < 5; i++) { - stats.bases[i] = bases[i]; - } - } - - return stats; -} - -int main(int argc, char *argv[]) { - bool opt_header = false; - bool opt_n50 = false; - bool force_fasta = false; - bool force_fastq = false; - bool extra = false; - static struct option long_options[] = { - {"fasta", no_argument, 0, 'a'}, - {"fastq", no_argument, 0, 'q'}, - {"header", no_argument, 0, 'H'}, - {"n50", no_argument, 0, 'n'}, - {"help", no_argument, 0, 'h'}, - {"version", no_argument, 0, 'v'}, - {"extra", no_argument, 0, 'x'}, - {0, 0, 0, 0} - }; - - int opt; - while ((opt = getopt_long(argc, argv, "aqHnhvx", long_options, NULL)) != -1) { - switch (opt) { - case 'a': - force_fasta = true; - break; - case 'q': - force_fastq = true; - break; - case 'H': - opt_header = true; - break; - case 'n': - opt_n50 = true; - break; - case 'h': - print_usage(argv[0]); - return 0; - case 'v': - print_version(); - return 0; - case 'x': - extra = true; - break; - default: - fprintf(stderr, "Try '%s --help' for more information.\n", argv[0]); - return 1; - } - } - - if (optind == argc) { - fprintf(stderr, "Error: No input files specified\n"); - fprintf(stderr, "Try '%s --help' for more information.\n", argv[0]); - return 1; - } - -if (opt_header && !opt_n50) { - if (!extra) { - printf("Filename\tFormat\tTotal_Length\tTotal_Sequences\tN50\n"); - } else { - printf("Filename\tFormat\tTotal_Length\tTotal_Sequences\tN50\tMin\tMax\tA\tC\tG\tT\tOther\n"); - } - } - - for (int i = optind; i < argc; i++) { - FileStats stats = process_file(argv[i], force_fasta, force_fastq, extra); - - if (opt_n50) { - printf("%s\t%d\n", stats.filename, stats.n50); - } else { - if (extra) { - // prepare base stats as ratio on the total of bases, printing them as float with 2 decimal digits - double total_bases = stats.bases[0] + stats.bases[1] + stats.bases[2] + stats.bases[3] + stats.bases[4]; - double base_ratios[5]; - for (int i = 0; i < 5; i++) { - base_ratios[i] = (double)stats.bases[i] / total_bases * 100; - } - printf("%s\t%s\t%lld\t%d\t%d\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n", - stats.filename, - stats.is_fastq ? "FASTQ" : "FASTA", - stats.total_length, - stats.length_count, - stats.n50, - stats.min_length, - stats.max_length, - base_ratios[0], - base_ratios[1], - base_ratios[2], - base_ratios[3], - base_ratios[4]); - } else { - printf("%s\t%s\t%lld\t%d\t%d\n", stats.filename, stats.is_fastq ? "FASTQ" : "FASTA", stats.total_length, stats.length_count, stats.n50); - } - } - - free(stats.filename); - - } - - return 0; -} diff --git a/src/n50_binner.c b/src/n50_binner.c index 440d18c..4484355 100644 --- a/src/n50_binner.c +++ b/src/n50_binner.c @@ -8,7 +8,7 @@ // Function to determine the bin for a given length int get_bin(int length) { - int bins[] = {10, 100, 1000, 2500, 5000, 10000, 20000, 35000, 50000, 75000, 100000, 200000, 300000, 500000, 750000, 1000000}; + int bins[] = {10, 100, 1000, 2500, 5000, 10000, 20000, 30000, 40000, 50000, 75000, 100000, 200000, 300000, 500000, 600000, 700000, 800000, 900000, 1000000, 1500000, 2500000, 5000000}; for (int i = 0; i < NUM_BINS; i++) { if (length <= bins[i]) { return i; diff --git a/src/n50_generate.c b/src/n50_generate.c new file mode 100644 index 0000000..e880d9d --- /dev/null +++ b/src/n50_generate.c @@ -0,0 +1,211 @@ +#include +#include +#include +#include +#include +#include +#include + +#define MAX_LINE_LENGTH 1024 +#define MAX_COMMAND_LENGTH 4096 + +void print_usage(const char *program_name) { + fprintf(stderr, "Usage: %s -i INPUTFILE -o OUTDIR [-f FORMAT] [-s PATH]\n", program_name); + fprintf(stderr, " -i INPUTFILE Input file path\n"); + fprintf(stderr, " -o OUTDIR Output directory\n"); + fprintf(stderr, " -f FORMAT Optional: Output format (FASTQ by default, FASTA also supported)\n"); + fprintf(stderr, " -s PATH Optional: Path to n50_simreads executable\n"); +} + +char *get_executable_path(const char *argv0) { + char *path = strdup(argv0); + if (!path) { + perror("Memory allocation failed"); + exit(EXIT_FAILURE); + } + char *dir = dirname(path); + char *full_path = malloc(strlen(dir) + strlen("/n50_simreads") + 1); + if (!full_path) { + perror("Memory allocation failed"); + free(path); + exit(EXIT_FAILURE); + } + sprintf(full_path, "%s/n50_simreads", dir); + free(path); + return full_path; +} + +void process_input_file(const char *input_file, char *generated_string, size_t max_length) { + FILE *file = fopen(input_file, "r"); + if (!file) { + perror("Error opening input file"); + exit(EXIT_FAILURE); + } + + char line[MAX_LINE_LENGTH]; + int line_number = 0; + size_t current_length = 0; + + while (fgets(line, sizeof(line), file)) { + line_number++; + if (line_number == 1) continue; // Skip header + + char *token = strtok(line, ","); + if (!token) continue; + + int length = atoi(token); + token = strtok(NULL, ","); + if (!token) continue; + + int count = atoi(token); + + if (length <= 0 || count <= 0) { + fprintf(stderr, "Warning: Invalid data on line %d, skipping\n", line_number); + continue; + } + + int written = snprintf(generated_string + current_length, max_length - current_length, + "%d*%d ", count, length); + + if (written < 0 || written >= max_length - current_length) { + fprintf(stderr, "Error: Generated string too long\n"); + fclose(file); + exit(EXIT_FAILURE); + } + + current_length += written; + } + + if (current_length > 0) { + generated_string[current_length - 1] = '\0'; // Remove trailing space + } + + fclose(file); +} + +void run_n50_simreads(const char *format, const char *outdir, const char *generated_string, const char *n50_simreads_path) { + char command[MAX_COMMAND_LENGTH]; + snprintf(command, sizeof(command), "%s --%s -o %s %s", + n50_simreads_path, format, outdir, generated_string); + + printf("Executing command: %s\n", command); + + int result = system(command); + if (result == -1) { + perror("Error executing n50_simreads command"); + exit(EXIT_FAILURE); + } else if (WIFEXITED(result) && WEXITSTATUS(result) != 0) { + fprintf(stderr, "n50_simreads command failed with exit status %d\n", WEXITSTATUS(result)); + exit(EXIT_FAILURE); + } +} + +void calculate_stats(const char *input_file) { + FILE *file = fopen(input_file, "r"); + if (!file) { + perror("Error opening input file for stats calculation"); + exit(EXIT_FAILURE); + } + + char line[MAX_LINE_LENGTH]; + int line_number = 0; + long long total_reads = 0; + int max_length = 0; + + while (fgets(line, sizeof(line), file)) { + line_number++; + if (line_number == 1) continue; // Skip header + + char *token = strtok(line, ","); + if (!token) continue; + + int length = atoi(token); + token = strtok(NULL, ","); + if (!token) continue; + + int count = atoi(token); + + if (length <= 0 || count <= 0) { + fprintf(stderr, "Warning: Invalid data on line %d, skipping\n", line_number); + continue; + } + + total_reads += count; + if (length > max_length) { + max_length = length; + } + } + + fclose(file); + + printf("Total number of reads: %lld\n", total_reads); + printf("Maximum read length: %d\n", max_length); +} + +int main(int argc, char *argv[]) { + char *input_file = NULL; + char *outdir = NULL; + char *format = "FASTQ"; + char *n50_simreads_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "i:o:f:s:h")) != -1) { + switch (opt) { + case 'i': + input_file = optarg; + break; + case 'o': + outdir = optarg; + break; + case 'f': + format = optarg; + break; + case 's': + n50_simreads_path = optarg; + break; + case 'h': + print_usage(argv[0]); + exit(EXIT_SUCCESS); + default: + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + } + + if (!input_file || !outdir) { + fprintf(stderr, "Error: Input file and output directory are required.\n"); + print_usage(argv[0]); + exit(EXIT_FAILURE); + } + + // Convert format to uppercase + for (char *p = format; *p; ++p) *p = toupper(*p); + + // Validate format + if (strcmp(format, "FASTQ") != 0 && strcmp(format, "FASTA") != 0) { + fprintf(stderr, "Error: Invalid format. Use FASTQ or FASTA.\n"); + exit(EXIT_FAILURE); + } + + // Set default n50_simreads path if not provided + if (!n50_simreads_path) { + n50_simreads_path = get_executable_path(argv[0]); + } + + // Process input file + char generated_string[MAX_COMMAND_LENGTH]; + process_input_file(input_file, generated_string, sizeof(generated_string)); + + // Run n50_simreads + run_n50_simreads(format, outdir, generated_string, n50_simreads_path); + + // Calculate and print stats + calculate_stats(input_file); + + // Free allocated memory + if (n50_simreads_path != NULL && n50_simreads_path != get_executable_path(argv[0])) { + free(n50_simreads_path); + } + + return 0; +} \ No newline at end of file diff --git a/src/n50_simreads.bak b/src/n50_simreads.bak deleted file mode 100644 index 5212967..0000000 --- a/src/n50_simreads.bak +++ /dev/null @@ -1,91 +0,0 @@ -#include -#include -#include -#include -#include - -#define MAX_ARGS 100 - -// Function to generate a random DNA sequence -void generate_sequence(char *seq, long length) { - const char bases[] = "ACGT"; - for (long i = 0; i < length; i++) { - seq[i] = bases[rand() % 4]; - } - seq[length] = '\0'; -} - -// Function to generate a random quality string -void generate_quality(char *qual, long length) { - for (long i = 0; i < length; i++) { - qual[i] = 33 + (rand() % 41); // ASCII 33 to 73 - } - qual[length] = '\0'; -} - -// Function to parse size string (e.g., "1kb", "2Mb") -long parse_size(const char *size_str) { - long size = atol(size_str); - char suffix = toupper(size_str[strlen(size_str) - 1]); - - switch (suffix) { - case 'K': size *= 1000; break; - case 'M': size *= 1000000; break; - case 'G': size *= 1000000000; break; - } - - return size; -} - -int main(int argc, char *argv[]) { - if (argc < 3) { - fprintf(stderr, "Usage: %s [--fasta|--fastq] ARGS\n", argv[0]); - return 1; - } - - int is_fastq = 0; - if (strcmp(argv[1], "--fastq") == 0) { - is_fastq = 1; - } else if (strcmp(argv[1], "--fasta") != 0) { - fprintf(stderr, "Invalid format option. Use --fasta or --fastq.\n"); - return 1; - } - - srand(time(NULL)); - - for (int i = 2; i < argc; i++) { - char *arg = argv[i]; - char *count_str = strtok(arg, "*"); - char *size_str = strtok(NULL, "*"); - - if (!count_str || !size_str) { - fprintf(stderr, "Invalid argument format: %s\n", arg); - continue; - } - - int count = atoi(count_str); - long size = parse_size(size_str); - - char *sequence = malloc(size + 1); - char *quality = NULL; - if (is_fastq) { - quality = malloc(size + 1); - } - - for (int j = 0; j < count; j++) { - generate_sequence(sequence, size); - - if (is_fastq) { - generate_quality(quality, size); - printf("@Simulated_read_%d\n%s\n+\n%s\n", j+1, sequence, quality); - } else { - printf(">Simulated_read_%d\n%s\n", j+1, sequence); - } - } - - free(sequence); - if (quality) free(quality); - } - - return 0; -} \ No newline at end of file diff --git a/src/n60.c b/src/n60.c deleted file mode 100644 index 3e328bd..0000000 --- a/src/n60.c +++ /dev/null @@ -1,226 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#define BUFFER_SIZE 1024 * 1024 // 1MB buffer -#define MAX_THREADS 8 -#define INITIAL_CAPACITY 1000000 - -typedef struct { - int *lengths; - int start; - int end; - long long total_length; -} ThreadData; - -typedef struct { - char *filename; - long long total_length; - int length_count; - int n50; - bool is_fastq; -} FileStats; - -int compare(const void *a, const void *b) { - return (*(int*)b - *(int*)a); -} - -void *process_chunk(void *arg) { - ThreadData *data = (ThreadData*)arg; - long long local_total_length = 0; - int *local_lengths = malloc((data->end - data->start) * sizeof(int)); - int local_count = 0; - - for (int i = data->start; i < data->end; i++) { - local_total_length += data->lengths[i]; - local_lengths[local_count++] = data->lengths[i]; - } - - ThreadData *result = malloc(sizeof(ThreadData)); - result->lengths = local_lengths; - result->start = data->start; - result->end = data->end; - result->total_length = local_total_length; - - return result; -} - -FileStats process_file(const char *filename, bool force_fasta, bool force_fastq) { - gzFile fp = gzopen(filename, "r"); - if (!fp) { - fprintf(stderr, "Error: Cannot open file %s\n", filename); - exit(1); - } - - char buffer[BUFFER_SIZE]; - int *chunk_lengths = NULL; - int chunk_count = 0; - int chunk_capacity = INITIAL_CAPACITY; - int current_length = 0; - bool is_fastq = force_fastq || (!force_fasta && (strstr(filename, ".fastq") != NULL || strstr(filename, ".fq") != NULL)); - - chunk_lengths = malloc(chunk_capacity * sizeof(int)); - - if (is_fastq) { - int line_count = 0; - while (gzgets(fp, buffer, BUFFER_SIZE) != NULL) { - line_count++; - if (line_count % 4 == 2) { // Sequence line - int len = strcspn(buffer, "\n"); // Count characters until newline - if (chunk_count == chunk_capacity) { - chunk_capacity *= 2; - chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(int)); - } - chunk_lengths[chunk_count++] = len; - } - } - } else { - while (gzgets(fp, buffer, BUFFER_SIZE) != NULL) { - if (buffer[0] == '>') { - if (current_length > 0) { - if (chunk_count == chunk_capacity) { - chunk_capacity *= 2; - chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(int)); - } - chunk_lengths[chunk_count++] = current_length; - current_length = 0; - } - } else { - for (char *p = buffer; *p != '\0'; p++) { - if (isalpha(*p)) { - current_length++; - } - } - } - } - if (current_length > 0) { - if (chunk_count == chunk_capacity) { - chunk_capacity *= 2; - chunk_lengths = realloc(chunk_lengths, chunk_capacity * sizeof(int)); - } - chunk_lengths[chunk_count++] = current_length; - } - } - - gzclose(fp); - - // Process chunks using threads - int num_threads = (chunk_count < MAX_THREADS) ? chunk_count : MAX_THREADS; - pthread_t threads[MAX_THREADS]; - ThreadData thread_data[MAX_THREADS]; - int chunk_size = chunk_count / num_threads; - - for (int i = 0; i < num_threads; i++) { - thread_data[i].lengths = chunk_lengths; - thread_data[i].start = i * chunk_size; - thread_data[i].end = (i == num_threads - 1) ? chunk_count : (i + 1) * chunk_size; - pthread_create(&threads[i], NULL, process_chunk, &thread_data[i]); - } - - long long total_length = 0; - int *lengths = malloc(chunk_count * sizeof(int)); - int length_count = 0; - - for (int i = 0; i < num_threads; i++) { - ThreadData *result; - pthread_join(threads[i], (void**)&result); - total_length += result->total_length; - memcpy(lengths + length_count, result->lengths, (result->end - result->start) * sizeof(int)); - length_count += result->end - result->start; - free(result->lengths); - free(result); - } - - free(chunk_lengths); - - // Sort lengths in descending order - qsort(lengths, length_count, sizeof(int), compare); - - // Calculate N50 - long long cumulative_length = 0; - long long half_total_length = total_length / 2; - int n50 = 0; - for (int i = 0; i < length_count; i++) { - cumulative_length += lengths[i]; - if (cumulative_length >= half_total_length) { - n50 = lengths[i]; - break; - } - } - - free(lengths); - - FileStats stats = { - .filename = strdup(filename), - .total_length = total_length, - .length_count = length_count, - .n50 = n50, - .is_fastq = is_fastq - }; - - return stats; -} - -int main(int argc, char *argv[]) { - bool opt_header = false; - bool opt_n50 = false; - bool force_fasta = false; - bool force_fastq = false; - - static struct option long_options[] = { - {"fasta", no_argument, 0, 'a'}, - {"fastq", no_argument, 0, 'q'}, - {"header", no_argument, 0, 'h'}, - {"n50", no_argument, 0, 'n'}, - {0, 0, 0, 0} - }; - - int opt; - while ((opt = getopt_long(argc, argv, "aqhn", long_options, NULL)) != -1) { - switch (opt) { - case 'a': - force_fasta = true; - break; - case 'q': - force_fastq = true; - break; - case 'h': - opt_header = true; - break; - case 'n': - opt_n50 = true; - break; - default: - fprintf(stderr, "Usage: %s [--fasta | --fastq] [--header] [--n50] [filename...]\n", argv[0]); - return 1; - } - } - - if (optind == argc) { - fprintf(stderr, "Error: No input files specified\n"); - return 1; - } - - if (opt_header && !opt_n50) { - printf("Filename\tFormat\tTotal_Length\tTotal_Sequences\tN50\n"); - } - - for (int i = optind; i < argc; i++) { - FileStats stats = process_file(argv[i], force_fasta, force_fastq); - - if (opt_n50) { - printf("%s\t%d\n", stats.filename, stats.n50); - } else { - printf("%s\t%s\t%lld\t%d\t%d\n", stats.filename, stats.is_fastq ? "FASTQ" : "FASTA", stats.total_length, stats.length_count, stats.n50); - } - - free(stats.filename); - } - - return 0; -} diff --git a/test/dirbench.py b/test/dirbench.py new file mode 100755 index 0000000..674fa08 --- /dev/null +++ b/test/dirbench.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +""" +A program to benchmark N50 calculations tools +using hyperfine. +""" + +import os +import sys +import subprocess + +def check_deps(verbose): + """ + Check if hyperfine is installed. + """ + test_cmds = [ + ['hyperfine', '--version'], + ['seqfu', '--version'], + ['n50', '--version'], + ['seqkit', 'version'], + ] + + for cmd in test_cmds: + try: + subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + if verbose: + print(f"Command {cmd[0]} found.") + except FileNotFoundError: + print(f"Command {cmd[0]} not found. Exiting.") + sys.exit(1) + +def split_files_type(files): + """ + Split the list of files into a dictionary of lists + """ + + results = { + 'fasta' : [], + 'fastq' : [], + 'fasta_gz' : [], + 'fastq_gz' : [], + } + exts = { + '.fasta' : 'fasta', + '.fastq' : 'fastq', + '.fasta.gz' : 'fasta_gz', + '.fastq.gz' : 'fastq_gz', + '.fq' : 'fastq', + '.fq.gz' : 'fastq_gz', + '.fa' : 'fasta', + '.fa.gz' : 'fasta_gz', + '.fna' : 'fasta' + } + for file in files: + for ext in exts: + if file.endswith(ext): + results[exts[ext]].append(file) + + return results + +def gen_cmd(files, params=['--max-runs 5', '--warmup 1']): + cmd = ['hyperfine'] + cmd.extend(params) + progs = ['n50', 'seqfu stats', 'seqkit stats --all'] + for p in progs: + cmd.append(f"{p} {' '.join(files)}") + + print(cmd) + +if __name__ == "__main__": + import argparse + args = argparse.ArgumentParser(description="Benchmark N50 calculation tools using hyperfine.") + args.add_argument("FILES", nargs="+", help="N50 calculation tools to benchmark.") + args.add_argument('-o','--outdir', help="") + args.add_argument("--verbose", action="store_true", help="Print verbose output.") + args = args.parse_args() + check_deps(args.verbose) + file_groups = split_files_type(args.FILES) + for ftype in file_groups: + print(ftype, "\t", len(file_groups[ftype])) + c = gen_cmd(file_groups[ftype]) + + \ No newline at end of file diff --git a/test/test.sh b/test/test.sh index ea27c4b..8c9d0b6 100644 --- a/test/test.sh +++ b/test/test.sh @@ -125,14 +125,14 @@ rm -rf "$RAND_TEMP_DIR" # Hyperfine section # -------------------------------------------------------------------------------- -# if hyperfine and seqkit and seqfu are available go on -# if NO_HYPERFINE is defined, skip -if [ -n "$NO_HYPERFINE" ]; then - echo "INFO Skipping benchmar NO_HYPERFINE is defined" + +if [[ ! -z "${NO_HYPERFINE+x}" ]]; then + echo "Skipping benchmark: hyperfine is set [$NO_HYPERFINE]" exit 0 fi + if [ -z "$(which hyperfine)" ] || [ -z "$(which seqkit)" ] || [ -z "$(which seqfu)" ]; then - echo "Skipping benchmark" + echo "Skipping benchmark: hyperfine, seqkit or seqfu not found" exit 0 fi