-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
hwu30
authored and
hwu30
committed
Aug 16, 2019
0 parents
commit 955a6e1
Showing
12 changed files
with
1,119 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
Package: Wind | ||
Title: Wind: weighted indexes for clustering evaluation | ||
Version: 0.9.1 | ||
Date: 2019-08-16 | ||
Author: Zhijin Wu <[email protected]>, Hao Wu <[email protected]> | ||
Maintainer: Zhijin Wu <[email protected]>, Hao Wu <[email protected]> | ||
Depends: R (>= 3.5), infotheo, matrixStats | ||
Suggests: BiocStyle, knitr, rmarkdown | ||
Description: This package provide two new metrics for evaluating cell clustering results | ||
from single cell RNA-seq data: weighted Rand index (wRI) | ||
and weighted normalized mutual information (wNMI). The new metrics | ||
take into account the hierarchical structure of cell types | ||
in evaluating the clustering result. | ||
License: GPL | ||
VignetteBuilder: knitr | ||
biocViews: Sequencing, RNASeq, SingleCell | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
export( | ||
wNMI, | ||
wRI, | ||
createWeights, | ||
createRef | ||
) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
######################################################### | ||
## functions for weighted normalized mutual information | ||
######################################################### | ||
|
||
## This function creates reference structure, | ||
## given an expression matrix and true cell classes | ||
createRef <- function(Y, classes) { | ||
## normalize by total | ||
ss = colMeans(Y) | ||
ss = ss / median(ss) | ||
Ynorm = sweep(Y, 2, ss, FUN="/") | ||
## get cluster means | ||
allClasses = unique(classes) | ||
mY = matrix(0, nrow=nrow(Y), ncol=length(allClasses)) | ||
for(i in 1:length(allClasses)) { | ||
ix = classes == allClasses[i] | ||
mY[,i] = rowMeans(Ynorm[,ix]) | ||
} | ||
colnames(mY) = allClasses | ||
|
||
## select some marker genes to hclust | ||
ix = selectMarker(mY) | ||
mY2 = mY[ix,] | ||
hc = hclust(dist(t(log(mY2+1)))) | ||
|
||
## create output | ||
weights = rev(hc$height) / max(hc$height) | ||
Ref = matrix(0, nrow=length(allClasses), ncol=length(allClasses)-1) | ||
for(i in 2:length(allClasses)) | ||
Ref[,i-1] = cutree(hc,k=i) | ||
rownames(Ref) = allClasses | ||
|
||
list(Ref=Ref, weights=weights, hc=hc) | ||
|
||
} | ||
|
||
|
||
## function to select marker genes for creating reference structure | ||
## I'm using the ones with the largest variance (of log counts), and the mean greater than 15 | ||
selectMarker <- function(YY) { | ||
rr = matrixStats::rowVars(log(YY+1)) | ||
## rr = rowMeans(mY) | ||
flag = rep(FALSE, length(rr)) | ||
ix = order(rr, decreasing=TRUE)[1:1000] ## select 1000 or so | ||
flag[ix] = TRUE | ||
flag[rowMeans(YY)<15] = FALSE | ||
ix = which(flag) | ||
ix | ||
} | ||
|
||
|
||
## Weighte normalized mutual information | ||
## Given the celltype structure, true classes and cluster result | ||
## Note trueclass don't have to be the groud truth. | ||
## It can just be a clustering of cells | ||
wNMI <- function(ctStruct, trueclass, cluster, use.weight=TRUE) { | ||
Ref = ctStruct$Ref | ||
if(use.weight) | ||
weights = ctStruct$weights | ||
else | ||
weights = rep(1, length(ctStruct$weights)) | ||
|
||
## create extended reference data with the same number of rows as Y | ||
X = Ref[trueclass,] | ||
X1 = cbind(1, X) | ||
HX = rep(NA,ncol(X)) | ||
for(i in 2:ncol(X1)) { | ||
HX[i-1] = condentropy(as.factor(X1[,i]), as.factor(X1[,i-1]) ) | ||
} | ||
|
||
HX.Y = apply( X, 2, function(xxx) condentropy(as.factor(xxx),cluster) ) | ||
HX.Y = diff(c(0,HX.Y)) | ||
HX0 = sum(HX) | ||
HY0 = entropy(cluster) | ||
MI1 = (sum(HX*weights)-sum(HX.Y*weights)) / sum(HX*weights) #fraction of HX* explained by Y | ||
NMI1 = MI1*HX0*2 / (HX0+HY0) | ||
NMI1 | ||
} | ||
|
||
## adjusted entropy | ||
aH <- function(X,d=rep(1,ncol(X))) { | ||
X1 = cbind(rep(1,nrow(X)),X) | ||
HX = rep(NA,ncol(X)) | ||
|
||
for(i in 2:ncol(X1)){ | ||
HX[i-1] = condentropy(X1[,i],X1[,i-1]) | ||
} | ||
sum(HX*d) | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
############################################ | ||
## functions for weighted Rand Index (wRI) | ||
############################################ | ||
|
||
## This function computes the weights used for weighted Rand Index, | ||
## given an expression matrix and true cell classes | ||
createWeights <- function(Y, classes) { | ||
## normalize by total | ||
ss = colMeans(Y) | ||
ss = ss / median(ss) | ||
Ynorm = sweep(Y, 2, ss, FUN="/") | ||
|
||
## get cluster means | ||
allClasses = unique(classes) | ||
mY = matrix(0, nrow=nrow(Y), ncol=length(allClasses)) | ||
for(i in 1:length(allClasses)) { | ||
ix = classes == allClasses[i] | ||
mY[,i] = rowMeans(Ynorm[,ix]) | ||
} | ||
colnames(mY) = allClasses | ||
|
||
## select some marker genes - this step is tricky | ||
ix = selectMarker(mY) | ||
mY2 = mY[ix,] | ||
|
||
## W1 is the correlation of cluster centers | ||
W1 = cor(mY2) ## note it can be negative | ||
|
||
## W0 is the average within group variances | ||
W0 = matrix(1, nrow=nrow(W1), ncol=ncol(W1)) | ||
rownames(W0) = rownames(W1) | ||
colnames(W0) = colnames(W1) | ||
for( i in 1:nrow(W1) ) { | ||
ix = classes==allClasses[i] | ||
tmp = cor(Y[,ix]) | ||
diag(tmp) = NA | ||
W0[i,i] = 1- mean(tmp, na.rm=TRUE) | ||
} | ||
|
||
list(W0=W0, W1=W1) | ||
} | ||
|
||
|
||
## The main Weighted Rand Index function | ||
wRI <- function(trueclass, cluster, w0=NULL, w1=NULL) { | ||
J = length(unique(trueclass)) #number of true classes | ||
|
||
if (is.null(w0)) { | ||
w0 = 1-diag(J) | ||
colnames(w0) = names(table(trueclass)) | ||
} # credit for keeping celltype j1 and j2 separate | ||
if (is.null(w1)) { | ||
w1 = diag(J) | ||
colnames(w1) = names(table(trueclass)) | ||
} | ||
|
||
T1 = table(cluster, trueclass) | ||
T1 = T1[,colnames(w0)] | ||
T0 = table(trueclass, trueclass) | ||
T0 = T0[colnames(w0), colnames(w0)] | ||
|
||
S1 = GetScore(T1, w0=w0, w1=w1) | ||
S0 = GetScore(T0, w0=w0, w1=w1) | ||
wRI1 = sum(S1[1:4])/sum(S0[1:4]) ## weighted Rand Index | ||
|
||
NI1 = (S1["N11"]+S1["N01"])/(S1["N11"]+S1["d0"]) #adjusted positive predicative value | ||
NI2 = (S1["N00"]+S1["N10"])/(S1["b0"]+S1["c0"]) #adjusted negative predicative value | ||
n = length(trueclass) | ||
output = c("wRI"=wRI1,"NI1"=NI1,"NI2"=NI2,"p1"=(S1["N11"]+S1["d0"])/choose(n,2), | ||
"p0"=(S1["b0"]+S1["c0"])/choose(n,2)) | ||
names(output) = c("wRI","NI1","NI2","p1", "p0") | ||
output | ||
|
||
} | ||
|
||
GetScore <- function(T1, w0, w1) { | ||
n = sum(T1) | ||
J = ncol(T1) | ||
I = nrow(T1) | ||
|
||
a = sum(choose(T1,2)) # friends remain friends (N11) | ||
c = d = 0 # c:friends separated (N10); d: foes called friends (N01) | ||
b0 = c0 = d0 = 0 | ||
|
||
for(i in 1:I){ | ||
for( j in 1:J) { | ||
c = c + sum(T1[i,j]*T1[-i,j]*w0[j,j])/2 #partial score in N10 | ||
d = d + sum(T1[i,j]*T1[i,-j]*w1[j,-j])/2 #partial score in N01 | ||
|
||
b0 = b0 + sum(T1[i,j]*colSums(matrix(T1[-i,-j],ncol=J-1)))/2 | ||
c0 = c0 + sum(T1[i,j]*T1[-i,j])/2 | ||
d0 = d0 + sum(T1[i,j]*T1[i,-j])/2 | ||
} | ||
} | ||
c(N11=a,N00=b0,N10=c,N01=d,b0=b0,c0=c0,d0=d0) | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
\name{createRef} | ||
\alias{createRef} | ||
\title{ | ||
Create reference hierarchy for computing weighted normalized mutual | ||
information (wNMI) | ||
} | ||
|
||
\description{ | ||
This function takes an expression matrix and true cell classes, and | ||
returns two square (JxJ) matrices, where J is the | ||
number of classes provided by true cell class. | ||
|
||
} | ||
\usage{ | ||
createRef(Y, classes) | ||
} | ||
%- maybe also 'usage' for other objects documented here. | ||
\arguments{ | ||
\item{Y}{The expression matrix. Rows are genes, columns are cells.} | ||
\item{classes}{True cell classes.} | ||
} | ||
|
||
\details{ | ||
The weights used in wNMI are computed based on the heights of the branches | ||
of the hierarchical tree of different cell types. We first compute mean | ||
expression profiles for all cell types, and then select 1000 marker | ||
genes using the same procedure as described in 'createWeights' function. A hierarchical tree | ||
is then constructed based on the mean expression from marker genes. We | ||
then obtain the tree height (from the bottom) at each branching | ||
point. The ratios of these heights to the maximum tree height are used | ||
as the weight in computing wNMI. | ||
} | ||
|
||
\value{ | ||
A list with following fields: | ||
\item{Ref}{A matrix for the reference cell type clustering when | ||
cutting the full hierarchical tree at different branching point. Each column | ||
represents a cut. For example, if a column is [1,1,1,2,2,3], it | ||
means the first 3 cell types are deemed in the same cluster, the | ||
next two are in another cluster, and the last cell type is in a | ||
cluster by itself. | ||
} | ||
\item{weights}{A vector for the relative tree heights at different | ||
cut. This will be used as weights in computing wNMI.} | ||
\item{hc}{An object of class 'hclust'. This is the hierarchical tree | ||
of the reference cell types.} | ||
} | ||
|
||
\author{ | ||
Zhijin Wu <zhijin_wu@brown.edu>, Hao Wu <hao.wu@emory.edu> | ||
} | ||
|
||
|
||
\seealso{ | ||
|
||
} | ||
\examples{ | ||
data(Zhengmix8eq) | ||
ctStruct = createRef(Y, trueclass) | ||
plot(ctStruct$hc) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
\name{createWeights} | ||
\alias{createWeights} | ||
|
||
\title{ | ||
Create weights used for computing weighted Rand Index (wRI) | ||
|
||
} | ||
\description{ | ||
This function takes an expression matrix and true cell classes, and | ||
returns two square (JxJ) matrices, where J is the | ||
number of classes provided by true cell class. | ||
} | ||
|
||
\usage{ | ||
createWeights(Y, classes) | ||
} | ||
|
||
\arguments{ | ||
\item{Y}{The expression matrix. Rows are genes, columns are cells.} | ||
\item{classes}{True cell classes.} | ||
} | ||
|
||
\details{ | ||
There are two weight matrices W1 (score for putting two cells in the | ||
same cluster) and W0 (score for separating two cells in different | ||
clusters). We set W1 has diagonal values 1 and off-diagonal less than | ||
1. To obtain off-diagonal values entry (i,j) in W1, we compute the | ||
mean expression profiles for all cell types from single-cell data, and | ||
then use the Pearson’s correlation of mean expression from cell types | ||
i and j as W1[i,j]. Note that in general, correlations among | ||
expression from different cell types are high because a majority of | ||
genes are not differentially expressed among cell types. To make W1 | ||
scores more distinctive, we take a marker selection step to pick the | ||
top 1000 genes with the largest variances of log expressions. The | ||
Pearson’s correlation of mean expression from these genes are then | ||
taken as the off-diagonal values for W1[i,j]. | ||
|
||
We make W0 has off diagonal values 1 and diagonal between 0 and 1, | ||
reflecting that keeping the | ||
separation existing in the reference receives full credit, but | ||
breaking a (weak) tie may not reduce the score completely to 0. We | ||
compute W0[i,i] based on the inter-cellular expression variances within | ||
cell type i. To be specific, we take expressions for all cells in cell | ||
type i and compute their Pearson’s correlations. W0[i,i] is defined as | ||
1 minus the average inter-cellular Pearson’s correlations from all | ||
cells. Thus, for a tight cluster where the within cell type | ||
correlation is high, the score will be closer to 0, indicating | ||
stronger penalty for separating cells for such a cluster. On the other | ||
hand, for a loose cluster where the within cell type correlation is | ||
low, the score will be larger, indicating weaker penalty. | ||
} | ||
|
||
\value{ | ||
A list with following fields: | ||
\item{W0}{A square matrix of dimension JxJ, where J is the | ||
number of classes provided by true cell class. The (i,j)-th entry is | ||
the score of separating two cells of types i and j in different | ||
clusters.} | ||
\item{W1}{A square matrix of dimension JxJ, where J is the | ||
number of classes provided by by true cell class The(i,j)-th entry is | ||
the score for putting two cells of different types in the same | ||
cluster.} | ||
} | ||
|
||
\author{ | ||
Zhijin Wu <zhijin_wu@brown.edu>, Hao Wu <hao.wu@emory.edu> | ||
} | ||
|
||
\seealso{ | ||
wRI | ||
} | ||
|
||
\examples{ | ||
data(Zhengmix8eq) | ||
weights = createWeights(Y, trueclass) | ||
} |
Oops, something went wrong.