-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRunClustering.py
64 lines (51 loc) · 2.12 KB
/
RunClustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import scipy.cluster.hierarchy as hcl
from scipy.spatial.distance import squareform
import numpy as np
import argparse
# Pivot the long-format likelihood-ratio table into a square gene-by-gene matrix
def LongToMatrix(geneList, lrFile):
    """Build a square gene-by-gene matrix of likelihood-ratio values.

    Parameters
    ----------
    geneList : sequence of gene identifiers
        Used as both the row and the column labels of the result.
    lrFile : pandas.DataFrame
        Long-format table; the columns ``gene1``, ``gene2`` and ``lr`` are read.

    Returns
    -------
    pandas.DataFrame
        Frame indexed and columned by ``geneList``. Cell ``[g1, g2]`` holds
        the ``lr`` value of the row whose ``gene1 == g1`` and ``gene2 == g2``;
        pairs that do not occur in ``lrFile`` stay at ``0.0``.
    """
    # Start from a float frame: the lr values are floats, and writing them
    # into an integer-initialised frame forces per-column dtype upcasting.
    disDF = pd.DataFrame(0.0, index=geneList, columns=geneList)
    for gene in geneList:
        rows = lrFile.loc[lrFile["gene1"] == gene, ["gene2", "lr"]]
        if not rows.empty:
            # Fill the row of `gene` at the columns named by its partner genes.
            disDF.loc[gene, rows["gene2"].values] = rows["lr"].values
    return disDF
# Convert the likelihood-ratio matrix into a distance matrix
def ToDistanceMatrix(disDF):
    """Turn a likelihood-ratio frame into a symmetric distance matrix.

    The frame is added to its transpose, every entry is subtracted from the
    largest entry of that sum (so a larger ratio becomes a smaller distance),
    and the diagonal is set to zero.

    Parameters
    ----------
    disDF : pandas.DataFrame
        Square frame of likelihood-ratio values.

    Returns
    -------
    numpy.ndarray
        Square array with a zero diagonal. The input frame is not modified.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor available on both old and new pandas versions.
    lrMatrix = disDF.values
    # The addition allocates a new array, so the caller's frame is untouched.
    disMatrix = lrMatrix + np.transpose(lrMatrix)
    disMatrix = disMatrix.max() - disMatrix
    np.fill_diagonal(disMatrix, 0)
    return disMatrix
# Construct the condensed distance matrix for linkage function
def ToCondensedMatrix(disMatrix):
    """Return the condensed vector form of ``disMatrix``.

    Delegates to ``squareform`` to produce the input format used by the
    hierarchical-clustering linkage step.
    """
    return squareform(disMatrix)
# Run hierarchical clustering
def RunHclust(t, criterior):
z = hcl.linkage(disArray, method='average')
clusterLabels = hcl.fcluster(z, t, criterion='distance')
return clusterLabels
# Parse the inputs from command line
parser = argparse.ArgumentParser(description='Run Hierarchical Clustering')
# Default to 'distance' so that omitting -c yields a usable criterion
# rather than None.
parser.add_argument("-c", dest="criterion", type=str, default="distance",
                    help="choose one of criteria: distance, maxclust, inconsistent...")
# The threshold is mandatory: without it None would reach the clustering call
# and fail there, so reject the invocation at parse time instead.
parser.add_argument("-v", dest="value", type=float, required=True,
                    help="value for criterion")
args = parser.parse_args()
t = args.value
criterion = args.criterion
# Read Likelihood ratio output file
lrFile = pd.read_csv("./LR_outputs.txt", names=["gene1", "gene2", "indep", "dep", "lr"])
# Prepare the data for hierarchical clustering
# Gene list = every gene seen in gene2, followed by any gene that only occurs
# in gene1, in order of first appearance. For a file where only the last
# row's gene1 is missing from gene2 this equals "unique gene2 + last gene1",
# but it cannot produce duplicate or missing labels for other row orders.
geneList = pd.Series(np.append(lrFile.gene2.values, lrFile.gene1.values)).unique()
disDF = LongToMatrix(geneList, lrFile)
disMatrix = ToDistanceMatrix(disDF)
# RunHclust reads this module-level name, so it must be bound before the call.
disArray = ToCondensedMatrix(disMatrix)
# Run clustering and output
clusterLabels = RunHclust(t, criterion)
clusterResults = pd.DataFrame({'GI': geneList, 'Labels': clusterLabels})
clusterResults.to_csv("./Clustering_outputs.txt", index=False)