Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FASTQ files to testing operations #3

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 25 additions & 23 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -75,46 +75,44 @@ process MANY_SMALL_FILES {
val num_files

output:
path 'generated_files', emit: files
path "file_*.bin", emit: files
path 'checksum.txt', emit: checksum

script:
"""
mkdir generated_files
for i in \$(seq 1 ${num_files}); do
dd if=/dev/zero of=generated_files/file_\$i.bin bs=1M count=10
dd if=/dev/zero of=file_\$i.bin bs=1M count=10
done

# Generate MD5 checksums
cd generated_files
md5sum * > ../checksum.txt
md5sum file_* > checksum.txt
"""
}

process COUNT_FILES {
input:
path files_folder
path "files/*"

output:
stdout

script:
"""
find ${files_folder}/* -type f | wc -l
find files/* | wc -l
"""
}

process RENAME_FILES {
input:
path files_folder
path "files/*"

output:
path 'renamed_files'

script:
"""
# First, create a copy of the original folder
cp -LR ${files_folder} original_files
cp -LR files original_files

# Now create the renamed_files directory and move files there
mkdir renamed_files
Expand All @@ -126,14 +124,14 @@ process RENAME_FILES {

process COMPRESS_FILES {
input:
path files_folder
path "files/*"

output:
path 'compressed_files.tar.gz'

script:
"""
tar -czvf compressed_files.tar.gz -C \$(readlink -f ${files_folder}) .
tar -czvhf compressed_files.tar.gz -C \$(readlink -f files) .
"""
}

Expand All @@ -146,11 +144,10 @@ process UNCOMPRESS_FILES {
script:
"""
mkdir uncompressed_files
tar -xzvf ${compressed_file} -C uncompressed_files
tar -xzvf ${compressed_file}

# Verify checksums
cd uncompressed_files
md5sum -c ../${original_checksum} > verification_results.txt
md5sum -c ${original_checksum} > verification_results.txt
if grep -q 'FAILED' verification_results.txt; then
echo "Checksum verification FAILED for some files"
exit 1
Expand All @@ -166,28 +163,33 @@ workflow {
generate_params = Channel.from(1..params.num_files).map { it -> tuple(params.total_reads, it) }

// Run GENERATE_FAKE_FASTQ processes in parallel
fake_fastq_files = GENERATE_FAKE_FASTQ(generate_params)
GENERATE_FAKE_FASTQ(generate_params)

// Compress the FASTQ files
fake_fastq_files = COMPRESS_FASTQ(GENERATE_FAKE_FASTQ.out)

// Collect all generated FASTQ files
collected_fastq_files = fake_fastq_files.collect()

// Concatenate all FASTQ files
CONCATENATE_FASTQ(collected_fastq_files)

// Compress the concatenated FASTQ file
COMPRESS_FASTQ(CONCATENATE_FASTQ.out)

// Generate many small files in a single process
small_files = MANY_SMALL_FILES(params.small_files)
MANY_SMALL_FILES(params.small_files)

all_files = Channel.empty()
.mix( MANY_SMALL_FILES.out.files.filter{ params.use_small_files } )
.mix( fake_fastq_files.filter{ params.use_fastq_files } )
.collect()

// Count how many files are generated
COUNT_FILES(small_files.files) | view { "Number of small files: $it" }
COUNT_FILES(all_files) | view { "Number of small files: $it" }

// Rename all these files
RENAME_FILES(small_files.files)
RENAME_FILES(all_files)

compressed = COMPRESS_FILES(small_files.files)
compressed = COMPRESS_FILES(all_files)

UNCOMPRESS_FILES(compressed, small_files.checksum)
// UNCOMPRESS_FILES(compressed, small_files.checksum)

}
12 changes: 7 additions & 5 deletions nextflow.config
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@

params.total_reads = 10000 // Total number of reads per file (10k reads generates a ~1GB file)
params.num_files = 10 // Number of FASTQ files to generate in parallel and concatenate
params.small_files = 1000 // Number of small files to generate in a single process
params.run = null // Tools to selectively run
params.skip = '' // Tools to selectively skip
params.total_reads = 10000 // Total number of reads per file (10k reads generates a ~1GB file)
params.num_files = 10 // Number of FASTQ files to generate in parallel and concatenate
params.small_files = 1000 // Number of small files to generate in a single process
params.run = null // Tools to selectively run
params.skip = '' // Tools to selectively skip
params.use_small_files = true // Use small files in downstream operations
params.use_fastq_files = false // Use FASTQ files in downstream operations
Comment on lines +2 to +8
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
params.total_reads = 10000 // Total number of reads per file (10k reads generates a ~1GB file)
params.num_files = 10 // Number of FASTQ files to generate in parallel and concatenate
params.small_files = 1000 // Number of small files to generate in a single process
params.run = null // Tools to selectively run
params.skip = '' // Tools to selectively skip
params.use_small_files = true // Use small files in downstream operations
params.use_fastq_files = false // Use FASTQ files in downstream operations
params.large_file_types = 'fastq' // File types to generate for the large-file tests
params.small_file_types = 'fastq,dev0' // File types to generate for the small-file tests
params.large_num_files = 10 // Number of large files to generate in parallel and concatenate
params.small_num_files = 1000 // Number of small files to generate in a single process
params.fastq_total_reads = 10000 // Total number of reads per file (10k reads generates a ~1GB file)
params.run_processes = null // Processes to selectively run
params.skip_processes = '' // Processes to selectively skip
//params.use_small_files = true // Use small files in downstream operations
//params.use_fastq_files = false // Use FASTQ files in downstream operations


process.container = 'quay.io/nextflow/bash'
process.cpus = 4
Expand Down
9 changes: 9 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,15 @@
"skip": {
"type": "string",
"description": "Selectively disable a tool. If this option is enabled a tool will be ignored. Note: this may affect downstream tools."
},
"use_small_files": {
"type": "boolean",
"default": true,
"description": "Add small files to downstream operations tests (e.g. count, rename, compress)."
},
"use_fastq_files": {
"type": "boolean",
"description": "Use FASTQ files for downstream operations tests (e.g. count, rename, compress)."
}
}
}
Expand Down