-
Notifications
You must be signed in to change notification settings - Fork 2
/
SC1_py_sctree.py
195 lines (176 loc) · 9.97 KB
/
SC1_py_sctree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# required packages
import os,sys
import numpy as np
import pandas as pd
from datetime import datetime as dt_
from SP1_SCT_UTIL import *
def main():
### Initialing variables #####################################################################
##############################################################################################
sys.setrecursionlimit(20000)
usage = "usage: python %prog <-P path> <-I input> <-D datatype>"
description = """Input integer copy number profile, diploid conversion should be done prior to this run.
Columns correspond to cell IDs and Rows correspond to genomic regions (scDNA) or genes (scRNA)."""
op = OptionParser(version="%prog 1.0", description=description, usage=usage, add_help_option=False)
op.add_option("-h", "--help",action="help",
help="Show this help message and exit.")
op.add_option("-P", "--Path",dest="Path",type="str",
help="Path to the script")
op.add_option("-I", "--Input",dest="Input",type="str",
help="Input file")
op.add_option("-G", "--Genome",dest="Genome",type="str",
help="Genome version hg19 or hg38")
op.add_option("-O", "--Output",dest="Output",type="str",
help="Output path. Default: input name + runtime")
op.add_option("-D", "--Datatype",dest="Datatype",type="str",
help="The type of input data. Either D (DNA-seq) or R (RNA-seq).")
op.add_option("-W", "--Windows",dest="Windows",type="str",
help="the number of genes to merge when the input data type is R. Default 30.")
op.add_option("-R", "--Permutation",dest="Permutation",type="str",
help="""Whether reconstructed permuted tree (T) or not (F).
If not, permuted copy number profile will be used to perform LSA.
Default value is F due to time cost.""")
(options,args) = op.parse_args()
# check input parameters. Package path, input file, data type and genome version are required.
if not options.Path or not options.Input or not options.Datatype or not options.Genome:
op.print_help() ; sys.exit(1)
# get the input parameters
PCKAGE_PATH = options.Path
IN_CNV_PATH = options.Input
IN_CNV_FILE = IN_CNV_PATH.split("/")[-1][:-4]
NUCLEC_ACID = options.Datatype
REF__GENOME = options.Genome
GENE_BIN_SZ = options.Windows if options.Windows else "30"
rt = dt_.now() ; rt = f"{rt.year}_{rt.month:0>2}{rt.day:0>2}_{rt.hour:0>2}{rt.minute:0>2}"
OUTPUT_PATH = options.Output if options.Output else f"{IN_CNV_PATH[:-4]}_{rt}"
if not options.Permutation: permutation = "F"
else: permutation = options.Permutation
# get the input parameters
PCKAGE_PATH = getPath(PCKAGE_PATH).replace("//", "/")
IN_CNV_PATH = getPath(IN_CNV_PATH).replace("//", "/")
OUTPUT_PATH = getPath(OUTPUT_PATH).replace("//", "/")
GENPOS_PATH = f"{PCKAGE_PATH}/genomes/gencode_v{REF__GENOME[-2:]}_gene_pos.txt"
BANDBD_PATH = f"{PCKAGE_PATH}/genomes/{REF__GENOME}.band.bed"
# supplementary files paths
DE_DUP_PATH = f"1_{IN_CNV_FILE}_dedup.csv"
DUPREF_PATH = f"1_{IN_CNV_FILE}_dup_ref.csv"
SEGCNV_PATH = f"2_{IN_CNV_FILE}_bin_{GENE_BIN_SZ}.csv"
SCTREE_PATH = f"3_CNV.tree.txt"
os.system("mkdir -p " + OUTPUT_PATH)
### Data reforming ###########################################################################
### change the input copy number profile to the matrix format used to infer Tree ###
### if the data type = R, estimate integer copy number by averaging adjacent genes. ###
### Default number of genes is 30. ###
##############################################################################################
# import cell CNV data
print("\n#####################################################")
print("### now running SC1_py_sctree.py ###")
print("#####################################################\n")
os.chdir(OUTPUT_PATH)
print(f"All intermediate files and output files will be stored in {OUTPUT_PATH}")
print("reading data")
df_ori = pd.read_csv(IN_CNV_PATH, sep="\t", index_col=0)
df_ori.columns = df_ori.columns.str.replace("[\ \-\.]", "_", regex=True)
# deduplication
cel_dup = df_ori.T.duplicated(keep="first")
cel_uni = cel_dup[~cel_dup].index.tolist() # unique cell list
cel_dup = cel_dup[ cel_dup].index.tolist() # duplicated cell list
# find duplication relationship
print("running deduplication")
dup_relationship = {"par_cell":[], "dup_cell":[]}
for dup_cell in cel_dup:
for uni_cell in cel_uni:
if df_ori[uni_cell].equals(df_ori[dup_cell]): # find cells of origin
dup_relationship["par_cell"].append(uni_cell)
dup_relationship["dup_cell"].append(dup_cell)
break
cell_dup_ref = pd.DataFrame(dup_relationship)
if cell_dup_ref.shape[0]>0:
# save duplication information
df_ded = df_ori.loc[:,cel_uni]
#df_ded = df_ded.iloc[:,np.random.choice(range(df_ded.shape[1]), size=min(df_ded.shape[1], 100), replace=False)]
print(f"""{df_ded.shape[1]}/{df_ori.shape[1]} cells remained after deduplication.
duplication relationship is stored in {DUPREF_PATH} and
dedup-data saved in {DE_DUP_PATH}, which will be used in the following analysis""")
df_ded.to_csv(DE_DUP_PATH, sep="\t")
cell_dup_ref.to_csv(DUPREF_PATH, sep="\t")
else:
print("no duplicating cells found, good!")
DE_DUP_PATH = IN_CNV_PATH
df_ded = df_ori
print(f"converting CN profile to segmental CN level:\n")
command = f"Rscript {PCKAGE_PATH}/SC2_RR_dataTransfer.R {OUTPUT_PATH} {DE_DUP_PATH} {NUCLEC_ACID} {SEGCNV_PATH}" # input DNA
if NUCLEC_ACID == "R": command = f"{command} {GENPOS_PATH} {GENE_BIN_SZ}" # input RNA
os.system(command) ; print(command)
### MEDLAT tree inference ####################################################################
### reformed segmental data will be used to infer MEDLAT single cell tree ###
### where a diploid cell will be used as the root (imputed if not existing) ###
##############################################################################################
nodes, root = read_CNV(SEGCNV_PATH)
node_list = nodes.keys()
#calculation of MED distance
print("\n#####################################################")
print("### going back to SC1_py_sctree.py ###")
print("#####################################################\n")
print("initializing tree")
tree_dict = create_tree(nodes, node_list, root, df_cor=None, len_threshold=30)
# set df_cor to None and leave proximity to True if no spatial coordinate information is provided
# this will automatically calculate pairwise MED instead of only connecting cells within close proximity
print("computing rdmst")
tree = compute_rdmst(tree_dict, root)[0]
with open(SCTREE_PATH,'w') as write:
write.write("\t".join(["stt", "end", "len"])+"\n") # header line
for in_node in tree.keys():
for ot_node in tree[in_node].keys():
write.write("\t".join([in_node, ot_node, str(tree[in_node][ot_node])])+"\n")
print(f"MEDALT inferrence finish, weighted tree saved to:{SCTREE_PATH}")
#Permutation process for lineage speciation analysis (LSA)
### LSA test #################################################################################
### lineage speciation analysis can be done with/without permutation. ###
### with permutation (T), both copy number profiles and single cells labels are shuffled ###
### to repeat above inference process and be used to estimate LSA result significance. ###
### otherwise (permu==F), only shuffle single cell labels to estimate significance. ###
##############################################################################################
if permutation == "T":
PERMUT_PATH = OUTPUT_PATH + "/permutation"
print("Reconstructing tree based on permutation data.")
print("This will take a long time! Please have some coffee.")
#permute copy number profile
step_3_cmd = f"Rscript {PCKAGE_PATH}/SC3_RR_Permutation.R {SCTREE_PATH} "
step_3_cmd = step_3_cmd + f"{IN_CNV_PATH} {NUCLEC_ACID} {OUTPUT_PATH}/permutation"
if datatype == "D":
os.system(step_3_cmd)
elif datatype == "R":
os.system(step_3_cmd + " "+ GENE_BIN_SZ + " " + REF__GENOME)
#Infer permutation tree
for j in range(1,101):
permutefile=permutationPath+"/permute."+str(j)+".CNV.txt"
(nodes,root) = read(permutefile)
node_name_list = nodes.keys()
g = create_tree(nodes, node_name_list,root)
result = compute_rdmst(g, root)
permuteTree=permutefile+".celltree.txt"
write=open(permuteTree,'w')
tree=result[0]
out1="from"+"\t"+"to"+"\t"+"dist"
write.write(out1)
for ele in tree.keys():
out=ele
for value in tree[ele].keys():
out1=out+"\t"+value+"\t"+str(tree[ele][value])
print >> write,out1
write.close()
print("Pemutation tree finish.")
elif permutation == "F": PERMUT_PATH = ""
#Identifying CNAs associated with cellular lineage expansion.
print("Performing LSA.")
cmd = f"Rscript {PCKAGE_PATH}/SC4_RR_LSA_tree.R {PCKAGE_PATH} {IN_CNV_PATH} {OUTPUT_PATH} {SCTREE_PATH} {SEGCNV_PATH} {NUCLEC_ACID} {REF__GENOME} {PERMUT_PATH}"
print(cmd)
os.system(cmd)
print("All is done!")
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt me, see you!\n")
sys.exit(0)