-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrunner.py
148 lines (139 loc) · 7.22 KB
/
runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import sys
import time
import multiprocessing
from os.path import dirname, basename
script_dir = sys.argv[0]
sys.path.append(os.path.dirname(script_dir))
from pfithic import *
from my_parser import parse_args
def align_files(reads, frags):
readsfiles = [f for f in os.listdir(reads) if f.find('count') > 0]
fragsfiles = [f for f in os.listdir(frags) if f.find('frags') > 0]
assert len(readsfiles) == len(fragsfiles), 'readsfile and fragsfile must be paired!'
chr_reads = set(f.split('_')[0] for f in readsfiles)
chr_frags = set(f.split('_')[0] for f in readsfiles)
assert len(chr_frags-chr_reads) == 0, 'readsfile and fragsfile must be paired in chromosomes!'
lib_reads = set(f.split('_')[1] for f in readsfiles)
lib_frags = set(f.split('_')[1] for f in readsfiles)
assert len(lib_frags-lib_reads) == 0, 'readsfile and fragsfile must be paired in libnames!'
chr_list = sorted(list(chr_reads))
lib_list = sorted(list(lib_reads))
files_dict = {}.fromkeys(chr_list)
for chrn in chr_list:
temp_dict = {}.fromkeys(lib_list)
for lib in lib_list:
temp_dict[lib] = [os.path.join(reads, f'{chrn}_{lib}_count.gz'), os.path.join(reads, f'{chrn}_{lib}_frags.gz')]
files_dict[chrn] = temp_dict
return files_dict, chr_list, lib_list
def fithic_workflow(outdir, readsFile, fragsFile, resolution, distLowThres, distUpThres, noOfBins, mappThres, contactType, noOfPasses, libname, biasFile=None, biasLowerBound=0.5, biasUpperBound=2, logger=False):
print(f'Will read {dirname(readsFile)} [{basename(readsFile)}] and [{basename(fragsFile)}]')
if logger:
f = open(os.path.join(outdir, f'zlog_{libname}.md'), 'w')
sys.stdout = f
print(f"GIVEN FIT-HI-C ARGUMENTS, LIB: {libname}")
print("==============================")
print(f"Output path will be: '{outdir}'")
print(f"Reading single interactions file from: {readsFile}")
print(f"Reading single fragments file from: {fragsFile}")
if biasFile is not None:
print(f"Reading bias file from {biasFile}")
assert biasUpperBound > biasLowerBound, "Error: bias lower bound is greater than bias upper bound. Please fix."
print(f"Bound of bias values is ({biasLowerBound}, {biasUpperBound})")
if resolution == 0:
print("Fixed size data not being used.")
elif resolution > 0:
print("Fixed size option detected, Fast version of FitHiC will be used")
print(f"Resolution is {resolution//1000} kb")
else:
print("INVALID RESOLUTION ARGUMENT DETECTED")
print("Please make sure the given resolution is a positive number greater than zero")
print("User-given resolution: %s" % resolution)
sys.exit(2)
print(f"Disired Distance threshold is ({distLowThres}, {distUpThres}]")
print(f"The maximal number of bins is {noOfBins}")
print(f"The minimal number of reads required for valid interaction is {mappThres}")
region_parser(contactType)
print(f"Start FitHiC now...[{noOfPasses} Pass totally]")
print("==============================\n")
outliersline, outliersdist = None, None
splineFDRx = {}.fromkeys(range(1, noOfPasses+1))
splineFDRy = {}.fromkeys(range(1, noOfPasses+1))
X = {}.fromkeys(range(1, noOfPasses+1))
Y = {}.fromkeys(range(1, noOfPasses+1))
Yerr = {}.fromkeys(range(1, noOfPasses+1))
splineXs = {}.fromkeys(range(1, noOfPasses+1))
splineYs = {}.fromkeys(range(1, noOfPasses+1))
for passNo in range(1, noOfPasses+1):
outfile = os.path.join(outdir, f'{libname}.pass{passNo}_fithic')
splinefile = os.path.join(outdir, f'{libname}.pass{passNo}_spline')
print(f"FIT-HI-C PROCESSING: {passNo}-TH PASS")
print("==============================")
allReads = read_countsFile(readsFile, distLowThres, distUpThres, outliersline)
mainDic, observedInterAllSum, observedIntraAllSum, observedIntraInRangeSum = get_maindic(allReads, distLowThres, distUpThres)
print("")
binStats, bins = make_bins(mainDic, noOfBins, observedIntraInRangeSum, outliersdist)
print("")
allFrags = read_fragsFile(fragsFile, resolution=resolution)
binStats, possibleIntraInRangeCount, possibleInterAllCount, interChrProb = make_fragpairs(binStats, allFrags, distLowThres, distUpThres, observedIntraInRangeSum, resolution)
print("")
x, y, yerr = calculateProbabilities(outfile, mainDic, binStats, observedIntraInRangeSum)
print("")
splineX, splineY, residual = fit_Spline(splinefile, mainDic, x, y, yerr, passNo)
print("")
outliesline, outliersdist, FDRx, FDRy = calculateSignificant(splinefile, readsFile, splineX, splineY, possibleIntraInRangeCount, observedIntraInRangeSum, possibleInterAllCount, observedInterAllSum, distLowThres, distUpThres, passNo, region=contactType)
print("One pass finished!")
print("==============================\n")
splineFDRx[passNo] = FDRx
splineFDRy[passNo] = FDRy
X[passNo] = x
Y[passNo] = y
Yerr[passNo] = yerr
splineXs[passNo] = splineX
splineYs[passNo] = splineY
if noOfPasses > 1:
comparefdr = os.path.join(outdir, f'{libname}.{noOfPasses}passes_fdr')
print(f'>>>> Ploting to {comparefdr}.svg')
compare_Spline_FDR(comparefdr, splineFDRx, splineFDRy)
comparespline = os.path.join(outdir, f'{libname}.{noOfPasses}passes_spline')
print(f'>>>> Ploting to {comparespline}.svg')
compareFits_Spline(comparespline, splineXs, splineYs, X, Y, Yerr)
if logger:
sys.stdout = sys.__stdout__
f.close()
def main(arguments):
pool_num = multiprocessing.cpu_count()
args = parse_args(arguments)
outdir = args.outdir
reads = args.intersfile
frags = args.fragsfile
resolution = args.resolution
distLowThres = args.distLowThres
distUpThres = args.distUpThres
noOfBins = args.noOfBins
mappThres = args.mappabilityThreshold
contactType = args.contactType
noOfPasses = args.noOfPasses
logger = args.logger
if os.path.isfile(reads):
os.makedirs(outdir, exist_ok=True)
libname = args.libname
fithic_workflow(outdir, reads, frags, resolution, distLowThres, distUpThres, noOfBins, mappThres, contactType, noOfPasses, libname, logger=logger)
elif os.path.isdir(reads):
files_dict, chr_list, lib_list = align_files(reads, frags)
start = time.time()
print(f'Start a multiprocess pool with processes = {pool_num} for pfithic analysis')
pool = multiprocessing.Pool(processes=pool_num)
print(f"Files: [{','.join(chr_list)}] x [{','.join(lib_list)}]")
for chrn in chr_list:
outdir_new = os.path.join(outdir, chrn)
os.makedirs(outdir_new, exist_ok=True)
for libname in lib_list:
reads = files_dict[chrn][libname][0]
frags = files_dict[chrn][libname][1]
pool.apply_async(fithic_workflow, (outdir_new, reads, frags, resolution, distLowThres, distUpThres, noOfBins, mappThres, contactType, noOfPasses, libname,), {'logger': logger})
pool.close()
pool.join()
print(f'All processing done. Running cost is {(time.time()-start)/60:.1f} min')
if __name__ == "__main__":
main(sys.argv[1:])