-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarize_project.py
executable file
·74 lines (64 loc) · 3.91 KB
/
summarize_project.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/opt/common/CentOS_6-dev/bin/current/python
"""Summarize the FACETS results of a project directory.

Three outputs are written, named after ``--outPrefix`` (or, when no prefix is
given, after the current working directory):

* ``<prefix>_CNCF.txt`` -- all cncf files stacked into one long table.
* ``<prefix>.SEG``      -- all seg files stacked into one long table.
* ``<prefix>_OUT.txt``  -- one tab-separated row per FACETS ``.out`` file.

Input files that are not listed on the command line are discovered by
globbing the current working directory.
"""
descr = "Concatenate FACETS out files into tabular per-sample format and cncf/seg files into long tables. Output is OUT, CNCF and SEG."

import argparse
import glob
import os
import re
import subprocess  # noqa: F401 -- no longer used here; kept so the file's imports are unchanged
import sys

# Column header of the per-sample summary table.
OUT_HEADER = 'Sample\tFacets\tsnp.nbhd\tndepth\tpurity_cval\tcval\tmin.nhet\tgenome\tPurity\tPloidy\tdipLogR\tloglik\tflags\n'

# Regular expressions searched for in the "key" part of each "key = value"
# line of an out file, listed in the column order of OUT_HEADER (minus the
# trailing "flags" column). '# cval' carries the comment marker so that it
# does not also hit the 'purity_cval' line.
OUT_FIELD_PATTERNS = [
    'TAG',
    'Facets version',
    'snp.nbhd',
    'ndepth',
    'purity_cval',
    '# cval',
    'min.nhet',
    'genome',
    'Purity',
    'Ploidy',
    'dipLogR',
    'loglik',
]


def build_parser():
    """Return the command-line parser (all options are optional)."""
    parser = argparse.ArgumentParser(description=descr, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-c', '--cncfFiles', help='List cncf files', nargs='+', required=False)
    parser.add_argument('-o', '--outFiles', help='List out files', nargs='+', required=False)
    parser.add_argument('-s', '--segFiles', help='List seg files', nargs='+', required=False)
    parser.add_argument('-p', '--outPrefix', help='Prefix output', required=False)
    return parser


def resolve_inputs(given, pattern):
    """Return ``given`` if the user supplied it, else the files matching ``pattern``.

    Globbed results are sorted because ``glob.glob`` returns them in arbitrary
    order; sorting makes the row order of the outputs reproducible. A
    user-supplied list keeps the order it was given in.
    """
    if given is None:
        return sorted(glob.glob(pattern))
    return given


def concatenate_tables(paths, dest):
    """Stack the tables in ``paths`` into ``dest``, keeping a single header.

    The first readable file is copied in full. For every later file the first
    line is dropped when it starts with ``ID`` (the header), and all remaining
    lines are copied. Only the first line is ever treated as a header, so data
    rows whose ID happens to start with "ID" are preserved.

    An empty ``paths`` produces an empty ``dest``. Unreadable inputs are
    reported on stderr and skipped rather than aborting the whole summary.
    """
    files_copied = 0
    with open(dest, 'w') as merged:
        for path in paths:
            try:
                source = open(path, 'r')
            except IOError as err:
                sys.stderr.write('WARNING: skipping %s: %s\n' % (path, err))
                continue
            with source:
                for line_no, line in enumerate(source):
                    if files_copied and line_no == 0 and line.startswith('ID'):
                        continue
                    # A file without a final newline must not glue its last
                    # row onto the first row of the next file.
                    if not line.endswith('\n'):
                        line += '\n'
                    merged.write(line)
            files_copied += 1


def parse_out_file(path):
    """Return the summary row (list of 13 strings) for one FACETS out file.

    The first twelve values are taken from the ``key = value`` lines, matched
    by OUT_FIELD_PATTERNS. The last value is every line after the first line
    containing "flags", with the leading ``#`` and surrounding whitespace
    removed from each, blank entries dropped, joined with ``|``. It is empty
    when the file has no flags section.

    Raises:
        ValueError: if one of the expected ``key = value`` entries is missing.
    """
    with open(path, 'r') as handle:
        lines = [line.strip('\n') for line in handle]

    # Split on the first '=' only, so values that themselves contain '='
    # (file names, for instance) do not break the parse. A repeated key keeps
    # its last value.
    settings = {}
    for line in lines:
        if '=' in line:
            key, _, value = line.partition('=')
            settings[key] = value.strip()

    row = []
    for pattern in OUT_FIELD_PATTERNS:
        hits = [value for key, value in settings.items() if re.search(pattern, key)]
        if not hits:
            raise ValueError('%s: no "%s" entry found' % (path, pattern))
        row.append(hits[0])

    flags = []
    for index, line in enumerate(lines):
        if 'flags' in line:
            flags = [entry.lstrip('#').strip() for entry in lines[index + 1:]]
            break
    row.append('|'.join(entry for entry in flags if entry))
    return row


def write_out_summary(paths, dest):
    """Write the header plus one row per out file in ``paths`` to ``dest``."""
    with open(dest, 'w') as output:
        output.write(OUT_HEADER)
        for path in paths:
            output.write('\t'.join(parse_out_file(path)) + '\n')


def main():
    """Parse the command line and write the CNCF, SEG and OUT summaries."""
    args = build_parser().parse_args()

    # If infiles are not specified, check the current directory.
    cncf_files = resolve_inputs(args.cncfFiles, '*cncf.txt')
    seg_files = resolve_inputs(args.segFiles, '*.seg')
    out_files = resolve_inputs(args.outFiles, '*out')

    # Without an explicit prefix the outputs are named after the directory.
    prefix = args.outPrefix
    if prefix is None:
        prefix = os.path.basename(os.getcwd())

    concatenate_tables(cncf_files, '%s_CNCF.txt' % prefix)
    concatenate_tables(seg_files, '%s.SEG' % prefix)
    write_out_summary(out_files, '%s_OUT.txt' % prefix)


if __name__ == '__main__':
    main()