-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSigProfilerJulia.jl
183 lines (145 loc) · 6.08 KB
/
SigProfilerJulia.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
using DelimitedFiles
using LinearAlgebra
using Clustering
using StatsBase
using Distributed
using DataFrames
using CSV
import ClusterManagers
using LinearAlgebra
using Distances
using Random
using Base
using ArgParse
function ParseCommands()
s = ArgParseSettings()
@add_arg_table s begin
"--mutation_file", "-f"
help = "File with the matrix count"
required = true
"--min_signatures_to_extract", "-s"
help = "Minimum number of signatures to extract"
arg_type = Int
required = true
"--max_signatures_to_extract", "-m"
help = "Maximum number of signatures to extract"
arg_type = Int
required = true
"--outpath", "-o"
help = "Folder where the results will be saved"
required = true
"--iterations", "-i"
help = "Total NMF iterations performed"
arg_type = Int
default = 1024
"--workers", "-n"
help = "Number of slurm parallel process or local cores"
arg_type = Int
default = 1
"--weak", "-w"
help = "Whether to remove weak mutation types. If unsure, use default"
arg_type = Int
default = 0
"--slurm"
help = "Whether to run the pipeline using SLURM"
action = :store_true
end
return parse_args(s)
end
parsed_args = ParseCommands()
numberWorkers = parsed_args["workers"]
println("Adding workers...")
if parsed_args["slurm"]
ClusterManagers.addprocs_slurm(numberWorkers)
else
addprocs(numberWorkers)
end
# get the path where the script is located
@eval @everywhere path_root=@__DIR__
# add all the necessary scripts
include(string(path_root, "/src/removeWeak.jl"))
include(string(path_root,"/src/addWeak.jl"))
include(string(path_root,"/src/filterOutIterations.jl"))
include(string(path_root,"/src/evaluateStability.jl"))
include(string(path_root,"/src/deconvoluteMutCounts.jl"))
include(string(path_root,"/src/extractSignaturesParallelNoArray.jl"))
include(string(path_root,"/src/removeAllSingleSignatures.jl"))
include(string(path_root,"/src/removeOneSig.jl"))
include(string(path_root,"/src/evalSingleSample.jl"))
include(string(path_root,"/src/stabilities.jl"))
# mutation file
mutation_file = parsed_args["mutation_file"]
# number of signatures to extract
min_numberProcessesToExtract = parsed_args["min_signatures_to_extract"]
max_numberProcessesToExtract = parsed_args["max_signatures_to_extract"]
# total number of iterations
totalIterations = parsed_args["iterations"]
# outpath where they will be saved
outpath = parsed_args["outpath"]
# define whether is a Pan whole-genome, so weak types won't be removed
isWeak = parsed_args["weak"]
remove_weak_muttypes=0.001
if isWeak==0
remove_weak_muttypes=0.00
end
# add seed so that the extraction is reproducible
Random.seed!(123456)
totalReplicates = 100
removeLastPercentage = 0.07
# Load input
genomesOriginal, sample_names = readdlm(mutation_file, '\t', Int, '\n'; header=true)
sample_header = vec(Symbol.(sample_names))
# Deconvolute mutation counts
genomes, sort_idx, n_types_to_remove = removeWeak(genomesOriginal, remove_weak_muttypes)
# start processing
name_ttype = splitext(basename(mutation_file))[1]
@info "Processing $name_ttype"
@info "Outpath $outpath"
if parsed_args["slurm"]
@info "Slurm mode"
end
# Bootstrap variables
@eval @everywhere parallelGenomes=$genomes
@eval @everywhere originalGenomes=$genomesOriginal
@eval @everywhere parallelTotalIterations=$totalIterations
@eval @everywhere OutPath=$outpath
@eval @everywhere mutfile = $name_ttype
@eval @everywhere initial_mutfile = $mutation_file
@eval @everywhere SavePath = joinpath(OutPath, mutfile)
mkpath(SavePath)
@everywhere parallelGenomesColsSum = sum(parallelGenomes, dims=1)
@everywhere parallelNormGenomes = parallelGenomes ./ repeat(parallelGenomesColsSum, size(parallelGenomes, 1), 1 )
for numberProcessesToExtract = min_numberProcessesToExtract : max_numberProcessesToExtract
@eval @everywhere parallelTotalProcesses=$numberProcessesToExtract
@info "Extracting $numberProcessesToExtract mutational signatures for $totalIterations iterations using $numberWorkers cores"
#signature extraction
@time processes, exposures, processStab, processStabAvg, exposures_fitting, avgReconstructionError, avgReconstructionErrorPercentage, avgStability, reconError, reconErrorPercentage, simError = deconvoluteMutCounts(numberProcessesToExtract, totalIterations, sort_idx, n_types_to_remove)
# save outfiles for each signature extracted
file = string(SavePath, "/exposures_", parallelTotalProcesses)
CSV.write(file, DataFrame(exposures), header = sample_header, delim = "\t")
file = string(SavePath, "/processes_", parallelTotalProcesses)
CSV.write(file, DataFrame(processes), delim = "\t")
file = string(SavePath, "/processesStabAvg_", parallelTotalProcesses)
processes_stab = convert(DataFrame, processStabAvg)
CSV.write(file, processes_stab, delim = "\t")
file = string(SavePath, "/exposures_fitting_", parallelTotalProcesses)
CSV.write(file, DataFrame(exposures_fitting), header = sample_header, delim = "\t")
file = string(SavePath, "/reconError_", parallelTotalProcesses)
CSV.write(file, DataFrame(reconError'), header = sample_header, delim="\t")
file = string(SavePath, "/reconErrorPercentage_", parallelTotalProcesses)
CSV.write(file, DataFrame(reconErrorPercentage'), header = sample_header, delim="\t")
file = string(SavePath, "/simError_", parallelTotalProcesses)
CSV.write(file, DataFrame(simError'), header = sample_header, delim="\t")
file = string(SavePath, "/avgReconstructionError_", parallelTotalProcesses)
writedlm(file, avgReconstructionError, "\t")
file = string(SavePath, "/avgReconstructionErrorPercentage_", parallelTotalProcesses)
writedlm(file, avgReconstructionErrorPercentage, "\t")
file = string(SavePath, "/avgStability_", parallelTotalProcesses)
writedlm(file, avgStability, "\t")
end
println("Removing workers...")
if numberWorkers>1
for i in workers()
rmprocs(i)
end
end