-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
🛠️ Modification to bin cobalt outputs and add purity forcing #3
Changes from all commits
a56dd1a
04c403a
e033572
6270a40
b763c14
b4e9645
cb86daf
8bb86cb
80fa8ae
f64a0ae
f4cafa5
4d5f1ef
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
params.cores = 4 | ||
params.cores = 1 | ||
params.memory = '4 GB' | ||
|
||
// Params Defaults in juno | ||
params.refGenome = "/work/isabl/ref/homo_sapiens/GRCh37d5/gr37.fasta" | ||
|
@@ -8,17 +9,26 @@ params.loci = "/data/copy_number/GermlineHetPon.37.vcf.gz" | |
params.gcProfile = "/data/copy_number/GC_profile.1000bp.37.cnp" | ||
params.ensemblDataDir = "/data/common/ensembl_data" | ||
params.diploidRegions = "/data/copy_number/DiploidRegions.37.bed.gz" | ||
params.binProbes = 0 | ||
params.binLogR = 0 | ||
params.minPurity = 0.08 | ||
params.maxPurity = 1.0 | ||
|
||
|
||
log.info """\ | ||
HMFTOOLS - PURPLE | ||
======================================== | ||
Params: | ||
---------------------------------------- | ||
tumor : ${params.tumor} | ||
tumorBam : ${params.tumorBam} | ||
outdir : ${params.outdir} | ||
cores : ${params.cores} | ||
tumor : ${params.tumor} | ||
tumorBam : ${params.tumorBam} | ||
outdir : ${params.outdir} | ||
cores : ${params.cores} | ||
memory : ${params.memory} | ||
binProbes : ${params.binProbes} | ||
binLogR : ${params.binLogR} | ||
minPurity : ${params.minPurity} | ||
maxPurity : ${params.maxPurity} | ||
======================================== | ||
Workflow: | ||
---------------------------------------- | ||
|
@@ -32,7 +42,7 @@ process runAmber { | |
tag "AMBER on ${params.tumor}" | ||
publishDir "${params.outdir}/amber", mode: 'copy' | ||
cpus params.cores | ||
memory '32 GB' | ||
memory params.memory | ||
time '1h' | ||
|
||
input: | ||
|
@@ -46,21 +56,30 @@ process runAmber { | |
|
||
script: | ||
""" | ||
amber \ | ||
-tumor ${tumor} \ | ||
-tumor_bam ${tumorBam} \ | ||
-output_dir \$PWD \ | ||
-threads ${params.cores} \ | ||
-loci ${params.loci} \ | ||
-ref_genome_version ${params.genomeVersion} | ||
if [ -f "${params.outdir}/amber/${tumor}.amber.baf.tsv.gz" ] && \ | ||
[ -f "${params.outdir}/amber/${tumor}.amber.baf.pcf" ] && \ | ||
[ -f "${params.outdir}/amber/${tumor}.amber.qc" ]; then | ||
echo "Output files already exist. Skipping amber execution." | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These checks are managed by nextflow. If nextflow sees the outputs you expect in a previous run, it will cached these step. Was this not happening for you? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is part of the updates for the force -- because the workflow has finished completely the runDir is already copied to the publishDir and it goes to redo these steps since the output files don't exist in the new runDir. |
||
ln -s ${params.outdir}/amber/${tumor}.amber.baf.tsv.gz ${tumor}.amber.baf.tsv.gz | ||
ln -s ${params.outdir}/amber/${tumor}.amber.baf.pcf ${tumor}.amber.baf.pcf | ||
ln -s ${params.outdir}/amber/${tumor}.amber.qc ${tumor}.amber.qc | ||
else | ||
amber \ | ||
-tumor ${tumor} \ | ||
-tumor_bam ${tumorBam} \ | ||
-output_dir \$PWD \ | ||
-threads ${params.cores} \ | ||
-loci ${params.loci} \ | ||
-ref_genome_version ${params.genomeVersion} | ||
fi | ||
""".stripIndent() | ||
} | ||
|
||
process runCobalt { | ||
tag "COBALT on ${params.tumor}" | ||
publishDir "${params.outdir}/cobalt", mode: 'copy' | ||
cpus params.cores | ||
memory '32 GB' | ||
memory params.memory | ||
time '1h' | ||
|
||
input: | ||
|
@@ -73,21 +92,102 @@ process runCobalt { | |
|
||
script: | ||
""" | ||
cobalt \ | ||
if [ -f "${params.outdir}/cobalt/${tumor}.cobalt.ratio.tsv.gz" ] && \ | ||
[ -f "${params.outdir}/cobalt/${tumor}.cobalt.ratio.pcf" ]; then | ||
echo "Output files already exist. Skipping cobalt execution." | ||
ln -s ${params.outdir}/cobalt/${tumor}.cobalt.ratio.tsv.gz ${tumor}.cobalt.ratio.tsv.gz | ||
ln -s ${params.outdir}/cobalt/${tumor}.cobalt.ratio.pcf ${tumor}.cobalt.ratio.pcf | ||
else | ||
cobalt \ | ||
-tumor ${tumor} \ | ||
-tumor_bam ${tumorBam} \ | ||
-output_dir \$PWD \ | ||
-threads ${params.cores} \ | ||
-gc_profile ${params.gcProfile} \ | ||
-tumor_only_diploid_bed ${params.diploidRegions} | ||
fi | ||
""".stripIndent() | ||
} | ||
|
||
process binCobalt { | ||
tag "COBALT BIN on ${params.tumor}" | ||
publishDir "${params.outdir}/cobalt/binned_${params.binProbes}_probes_${params.binLogR}_LogR", mode: 'copy' | ||
cpus params.cores | ||
memory params.memory | ||
time '1h' | ||
|
||
input: | ||
val tumor | ||
val binProbes | ||
val binLogR | ||
path cobalt_ratio_pcf | ||
path cobalt_ratio_tsv | ||
|
||
output: | ||
path "${tumor}.cobalt.ratio.tsv.gz", emit: cobalt_ratio_tsv | ||
path "${tumor}.cobalt.ratio.pcf", emit: cobalt_ratio_pcf | ||
|
||
script: | ||
""" | ||
#!/usr/bin/env python | ||
import pandas as pd | ||
import numpy as np | ||
|
||
cobalt_ratio_pcf = pd.read_csv('${cobalt_ratio_pcf}', sep='\\t') | ||
cobalt_ratio_pcf_probes = pd.DataFrame(columns=cobalt_ratio_pcf.columns) | ||
|
||
# First bin by probes | ||
chrom_arm = None | ||
last_idx = None | ||
for idx, seg in cobalt_ratio_pcf.iterrows(): | ||
if chrom_arm != '_'.join(seg[['chrom','arm']].astype(str)): | ||
chrom_arm = '_'.join(seg[['chrom','arm']].astype(str)) | ||
cobalt_ratio_pcf_probes = pd.concat([cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True) | ||
last_idx = cobalt_ratio_pcf_probes.index[-1] | ||
continue | ||
if ( | ||
cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] <= ${binProbes} | ||
) or ( | ||
seg['n.probes'] <= ${binProbes} | ||
): | ||
means = [cobalt_ratio_pcf_probes.loc[last_idx, 'mean']] * cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] | ||
means.extend([seg['mean']] * seg['n.probes']) | ||
cobalt_ratio_pcf_probes.loc[last_idx, 'mean'] = np.mean(means) | ||
cobalt_ratio_pcf_probes.loc[last_idx, 'n.probes'] += seg['n.probes'] | ||
cobalt_ratio_pcf_probes.loc[last_idx, 'end.pos'] = seg['end.pos'] | ||
else: | ||
cobalt_ratio_pcf_probes = pd.concat([cobalt_ratio_pcf_probes, seg.to_frame().T], ignore_index=True) | ||
last_idx = cobalt_ratio_pcf_probes.index[-1] | ||
|
||
# Then bin by logR mean | ||
cobalt_ratio_pcf_probes = cobalt_ratio_pcf_probes.reset_index().drop(columns="index") | ||
cobalt_ratio_pcf_probes_logR = pd.DataFrame(columns=cobalt_ratio_pcf_probes.columns) | ||
chrom_arm = None | ||
for idx, seg in cobalt_ratio_pcf_probes.iterrows(): | ||
if chrom_arm != '_'.join(seg[['chrom','arm']].astype(str)): | ||
chrom_arm = '_'.join(seg[['chrom','arm']].astype(str)) | ||
cobalt_ratio_pcf_probes_logR = pd.concat([cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True) | ||
last_idx = cobalt_ratio_pcf_probes_logR.index[-1] | ||
continue | ||
if abs(cobalt_ratio_pcf_probes.loc[last_idx, 'mean'] - seg['mean']) <= ${binLogR}: | ||
means = [cobalt_ratio_pcf_probes_logR.loc[last_idx, 'mean']] * cobalt_ratio_pcf_probes_logR.loc[last_idx, 'n.probes'] | ||
means.extend([seg['mean']] * seg['n.probes']) | ||
cobalt_ratio_pcf_probes_logR.loc[last_idx, 'mean'] = np.mean(means) | ||
cobalt_ratio_pcf_probes_logR.loc[last_idx, 'n.probes'] += seg['n.probes'] | ||
cobalt_ratio_pcf_probes_logR.loc[last_idx, 'end.pos'] = seg['end.pos'] | ||
else: | ||
cobalt_ratio_pcf_probes_logR = pd.concat([cobalt_ratio_pcf_probes_logR, seg.to_frame().T], ignore_index=True) | ||
last_idx = cobalt_ratio_pcf_probes_logR.index[-1] | ||
|
||
cobalt_ratio_pcf_probes_logR.to_csv("${tumor}.cobalt.ratio.pcf", sep='\\t', index=False) | ||
""".stripIndent() | ||
} | ||
|
||
process runPurple { | ||
tag "PURPLE on ${params.tumor}" | ||
publishDir "${params.outdir}/purple", mode: 'copy' | ||
publishDir "${params.outdir}/purple/purple_${params.minPurity}_${params.maxPurity}", mode: 'copy' | ||
cpus params.cores | ||
memory '32 GB' | ||
memory params.memory | ||
time '1h' | ||
|
||
input: | ||
|
@@ -97,6 +197,7 @@ process runPurple { | |
path amber_qc | ||
path cobalt_ratio_tsv | ||
path cobalt_ratio_pcf | ||
path cobalt_path | ||
|
||
output: | ||
path "${tumor}.purple.purity.tsv", emit: purple_purity_tsv | ||
|
@@ -118,21 +219,34 @@ process runPurple { | |
purple \ | ||
-tumor ${tumor} \ | ||
-amber ${params.outdir}/amber \ | ||
-cobalt ${params.outdir}/cobalt \ | ||
-cobalt ${cobalt_path} \ | ||
-output_dir \$PWD \ | ||
-gc_profile ${params.gcProfile} \ | ||
-ref_genome ${params.refGenome} \ | ||
-ref_genome_version ${params.genomeVersion} \ | ||
-ensembl_data_dir ${params.ensemblDataDir} \ | ||
-circos ${params.circos} | ||
-circos ${params.circos} \ | ||
-min_purity ${params.minPurity} \ | ||
-max_purity ${params.maxPurity} | ||
|
||
rsync -a --no-links \$PWD/ ${params.outdir}/purple/ | ||
""".stripIndent() | ||
} | ||
|
||
workflow { | ||
tumor = Channel.value(params.tumor) | ||
tumorBam = Channel.fromPath(params.tumorBam) | ||
|
||
runAmber(tumor, tumorBam) | ||
runCobalt(tumor, tumorBam) | ||
runPurple(tumor, runAmber.out, runCobalt.out) | ||
binProbes = Channel.value(params.binProbes) | ||
binLogR = Channel.value(params.binLogR) | ||
|
||
amberOutput = runAmber(tumor, tumorBam) | ||
cobaltOutput = runCobalt(tumor, tumorBam) | ||
|
||
if (binProbes != 0 || binLogR != 0) { | ||
binCobaltOutput = binCobalt(tumor, binProbes, binLogR, cobaltOutput.cobalt_ratio_pcf, cobaltOutput.cobalt_ratio_tsv) | ||
runPurple(tumor, amberOutput, binCobaltOutput, "${params.outdir}/cobalt/binned_${params.binProbes}_probes_${params.binLogR}_LogR") | ||
} else { | ||
runPurple(tumor, amberOutput, cobaltOutput, "${params.outdir}/cobalt") | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
sampleID chrom arm start.pos end.pos n.probes mean | ||
S1 2 p 14001 397001 340 -7.24 | ||
S1 17 p 1 397001 341 -8.813389396729216 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
nextflow_process { | ||
|
||
name "Test Process binCobalt" | ||
script "main.nf" | ||
process "binCobalt" | ||
|
||
test("Should bin Cobalt without failures") { | ||
|
||
when { | ||
params { | ||
tumor = "TEST" | ||
binProbes = 100 | ||
binLogR = 0.5 | ||
cobalt_ratio_pcf = "${projectDir}/tests/data/TEST.cobalt.ratio.pcf" | ||
cobalt_ratio_tsv = "${projectDir}/tests/data/TEST.cobalt.ratio.tsv.gz" | ||
} | ||
process { | ||
""" | ||
input[0] = Channel.value(params.tumor) | ||
input[1] = Channel.value(params.binProbes) | ||
input[2] = Channel.value(params.binLogR) | ||
input[3] = Channel.fromPath(params.cobalt_ratio_pcf) | ||
input[4] = Channel.fromPath(params.cobalt_ratio_tsv) | ||
""" | ||
} | ||
} | ||
|
||
then { | ||
assert process.success | ||
assert snapshot(process.out).match() | ||
|
||
// check expected files | ||
with(process.out) { | ||
assert cobalt_ratio_tsv.size() == 1 | ||
assert cobalt_ratio_pcf.size() == 1 | ||
} | ||
} | ||
|
||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"Should bin Cobalt without failures": { | ||
"content": [ | ||
{ | ||
"0": [ | ||
"TEST.cobalt.ratio.tsv.gz:md5,87cf8451b04e4373fc4dd7ccda3e6afe" | ||
], | ||
"1": [ | ||
"TEST.cobalt.ratio.pcf:md5,cd95f6b56cecfd80461bf1bd13c7a9fd" | ||
], | ||
"cobalt_ratio_pcf": [ | ||
"TEST.cobalt.ratio.pcf:md5,cd95f6b56cecfd80461bf1bd13c7a9fd" | ||
], | ||
"cobalt_ratio_tsv": [ | ||
"TEST.cobalt.ratio.tsv.gz:md5,87cf8451b04e4373fc4dd7ccda3e6afe" | ||
] | ||
} | ||
], | ||
"meta": { | ||
"nf-test": "0.9.0", | ||
"nextflow": "24.04.4" | ||
}, | ||
"timestamp": "2024-08-07T11:15:21.745138" | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Im thinking about keeping
32 GB
, and use4 GB
for testing innf.test
:Unless you think it doesn't need that much memory by default?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ended up adding a memory parameter because it seems sample/system dependent. I prefer to keep it low and then configure it higher on samples or systems where necessary but either way works.