Initial commit

haowulab · Aug 16, 2019 · 955a6e1 · 955a6e1
commit 955a6e1
Show file tree

Hide file tree

Showing 12 changed files with 1,119 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,17 @@
+Package: Wind
+Title: Wind: weighted indexes for clustering evaluation
+Version: 0.9.1
+Date: 2019-08-16
+Author: Zhijin Wu <[email protected]>, Hao Wu <[email protected]>
+Maintainer: Zhijin Wu <[email protected]>, Hao Wu <[email protected]>
+Depends: R (>= 3.5), infotheo, matrixStats
+Suggests: BiocStyle, knitr, rmarkdown
+Description: This package provide two new metrics for evaluating cell clustering results
+	 from single cell RNA-seq data: weighted Rand index (wRI)
+	 and weighted normalized mutual information (wNMI). The new metrics
+	 take into account the hierarchical structure of cell types 
+	 in evaluating the clustering result. 
+License: GPL
+VignetteBuilder: knitr
+biocViews: Sequencing, RNASeq, SingleCell
+
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,7 @@
+export(
+       wNMI,
+       wRI,
+       createWeights,
+       createRef
+       )
+
diff --git a/R/wNMI.R b/R/wNMI.R
@@ -0,0 +1,90 @@
+#########################################################
+## functions for weighted normalized mutual information
+#########################################################
+
+## This function creates reference structure,
+## given an expression matrix and true cell classes
+createRef <- function(Y, classes) {
+    ## normalize by total
+    ss = colMeans(Y)
+    ss = ss / median(ss)
+    Ynorm = sweep(Y, 2, ss, FUN="/")
+    ## get cluster means
+    allClasses = unique(classes)
+    mY = matrix(0, nrow=nrow(Y), ncol=length(allClasses))
+    for(i in 1:length(allClasses)) {
+        ix = classes == allClasses[i]
+        mY[,i] = rowMeans(Ynorm[,ix])
+    }
+    colnames(mY) = allClasses
+
+    ## select some marker genes to hclust
+    ix = selectMarker(mY)
+    mY2 = mY[ix,]
+    hc = hclust(dist(t(log(mY2+1))))
+
+    ## create output
+    weights = rev(hc$height) / max(hc$height)
+    Ref = matrix(0, nrow=length(allClasses), ncol=length(allClasses)-1)
+    for(i in 2:length(allClasses))
+        Ref[,i-1] = cutree(hc,k=i)
+    rownames(Ref) = allClasses
+
+    list(Ref=Ref, weights=weights, hc=hc)
+
+}
+
+
+## function to select marker genes for creating reference structure
+## I'm using the ones with the largest variance (of log counts), and the mean greater than 15
+selectMarker <- function(YY) {
+    rr = matrixStats::rowVars(log(YY+1))
+    ## rr = rowMeans(mY)
+    flag = rep(FALSE, length(rr))
+    ix = order(rr, decreasing=TRUE)[1:1000] ## select 1000 or so
+    flag[ix] = TRUE
+    flag[rowMeans(YY)<15] = FALSE
+    ix = which(flag)
+    ix
+}
+
+
+## Weighte normalized mutual information
+## Given the celltype structure, true classes and cluster result
+## Note trueclass don't have to be the groud truth.
+## It can just be a clustering of cells
+wNMI <- function(ctStruct, trueclass, cluster, use.weight=TRUE) {
+    Ref = ctStruct$Ref
+    if(use.weight)
+        weights = ctStruct$weights
+    else
+        weights = rep(1, length(ctStruct$weights))
+
+    ## create extended reference data with the same number of rows as Y
+    X = Ref[trueclass,]
+    X1 = cbind(1, X)
+    HX = rep(NA,ncol(X))
+    for(i in 2:ncol(X1)) {
+        HX[i-1] = condentropy(as.factor(X1[,i]), as.factor(X1[,i-1]) )
+    }
+
+    HX.Y = apply( X, 2, function(xxx) condentropy(as.factor(xxx),cluster) )
+    HX.Y = diff(c(0,HX.Y))
+    HX0 = sum(HX)
+    HY0 = entropy(cluster)
+    MI1 = (sum(HX*weights)-sum(HX.Y*weights)) / sum(HX*weights) #fraction of HX* explained by Y
+    NMI1 = MI1*HX0*2 / (HX0+HY0)
+    NMI1
+}
+
+## adjusted entropy
+aH <- function(X,d=rep(1,ncol(X))) {
+    X1 = cbind(rep(1,nrow(X)),X)
+    HX = rep(NA,ncol(X))
+
+    for(i in 2:ncol(X1)){
+        HX[i-1] = condentropy(X1[,i],X1[,i-1])
+    }
+    sum(HX*d)
+}
+
diff --git a/R/wRI.R b/R/wRI.R
@@ -0,0 +1,96 @@
+############################################
+## functions for weighted Rand Index (wRI)
+############################################
+
+## This function computes the weights used for weighted Rand Index,
+## given an expression matrix and true cell classes
+createWeights <- function(Y, classes) {
+    ## normalize by total
+    ss = colMeans(Y)
+    ss = ss / median(ss)
+    Ynorm = sweep(Y, 2, ss, FUN="/")
+
+    ## get cluster means
+    allClasses = unique(classes)
+    mY = matrix(0, nrow=nrow(Y), ncol=length(allClasses))
+    for(i in 1:length(allClasses)) {
+        ix = classes == allClasses[i]
+        mY[,i] = rowMeans(Ynorm[,ix])
+    }
+    colnames(mY) = allClasses
+
+    ## select some marker genes - this step is tricky
+    ix = selectMarker(mY)
+    mY2 = mY[ix,]
+
+    ## W1 is the correlation of cluster centers
+    W1 = cor(mY2) ## note it can be negative
+
+    ## W0 is the average within group variances
+    W0 = matrix(1, nrow=nrow(W1), ncol=ncol(W1))
+    rownames(W0) = rownames(W1)
+    colnames(W0) = colnames(W1)
+    for( i in 1:nrow(W1) ) {
+        ix = classes==allClasses[i]
+        tmp = cor(Y[,ix])
+        diag(tmp) = NA
+        W0[i,i] = 1- mean(tmp, na.rm=TRUE)
+    }
+
+    list(W0=W0, W1=W1)
+}
+
+
+## The main Weighted Rand Index function
+wRI <- function(trueclass, cluster, w0=NULL, w1=NULL) {
+    J = length(unique(trueclass)) #number of true classes
+
+    if (is.null(w0)) {
+        w0 = 1-diag(J)
+        colnames(w0) = names(table(trueclass))
+    } # credit for keeping celltype j1 and j2  separate
+    if (is.null(w1)) {
+        w1 = diag(J)
+        colnames(w1) = names(table(trueclass))
+    }
+
+    T1 = table(cluster, trueclass)
+    T1 = T1[,colnames(w0)]
+    T0 = table(trueclass, trueclass)
+    T0 = T0[colnames(w0), colnames(w0)]
+
+    S1 = GetScore(T1, w0=w0, w1=w1)
+    S0 = GetScore(T0, w0=w0, w1=w1)
+    wRI1 = sum(S1[1:4])/sum(S0[1:4])  ## weighted Rand Index
+
+    NI1 = (S1["N11"]+S1["N01"])/(S1["N11"]+S1["d0"]) #adjusted positive predicative value
+    NI2 = (S1["N00"]+S1["N10"])/(S1["b0"]+S1["c0"]) #adjusted negative predicative value
+    n = length(trueclass) 
+    output = c("wRI"=wRI1,"NI1"=NI1,"NI2"=NI2,"p1"=(S1["N11"]+S1["d0"])/choose(n,2),
+               "p0"=(S1["b0"]+S1["c0"])/choose(n,2))
+    names(output) = c("wRI","NI1","NI2","p1", "p0")
+    output
+
+}
+
+GetScore <- function(T1, w0, w1) {
+    n = sum(T1)
+    J = ncol(T1)
+    I = nrow(T1)
+
+    a = sum(choose(T1,2)) # friends remain friends (N11)
+    c = d = 0 # c:friends separated (N10); d: foes called friends (N01)
+    b0 = c0 = d0 = 0
+
+    for(i in 1:I){
+        for( j in 1:J) {
+            c = c + sum(T1[i,j]*T1[-i,j]*w0[j,j])/2 #partial score in N10
+            d = d + sum(T1[i,j]*T1[i,-j]*w1[j,-j])/2 #partial score in N01
+
+            b0 = b0 + sum(T1[i,j]*colSums(matrix(T1[-i,-j],ncol=J-1)))/2
+            c0 = c0 + sum(T1[i,j]*T1[-i,j])/2
+            d0 = d0 + sum(T1[i,j]*T1[i,-j])/2
+        }
+    }
+    c(N11=a,N00=b0,N10=c,N01=d,b0=b0,c0=c0,d0=d0)
+}
diff --git a/data/Zhengmix8eq.RData b/data/Zhengmix8eq.RData
diff --git a/man/createRef.Rd b/man/createRef.Rd
@@ -0,0 +1,61 @@
+\name{createRef}
+\alias{createRef}
+\title{
+  Create reference hierarchy for computing weighted normalized mutual
+  information (wNMI)
+}
+
+\description{
+  This function takes an expression matrix and true cell classes, and
+  returns two square (JxJ) matrices, where J is the
+  number of classes provided by true cell class.
+
+}
+\usage{
+createRef(Y, classes)
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{Y}{The expression matrix. Rows are genes, columns are cells.}
+  \item{classes}{True cell classes.}
+}
+
+\details{
+The weights used in wNMI are computed based on the heights of the branches
+of the hierarchical tree of different cell types. We first compute mean
+expression profiles for all cell types, and then select 1000 marker
+genes using the same procedure as described in 'createWeights' function. A hierarchical tree
+is then constructed based on the mean expression from marker genes. We
+then obtain the tree height (from the bottom) at each branching
+point. The ratios of these heights to the maximum tree height are used
+as the weight in computing wNMI.
+}
+
+\value{
+  A list with following fields:
+  \item{Ref}{A matrix for the reference cell type clustering when
+	cutting the full hierarchical tree at different branching point. Each column
+	represents a cut. For example, if a column is [1,1,1,2,2,3], it
+	means the first 3 cell types are deemed in the same cluster, the
+	next two are in another cluster, and the last cell type is in a
+	cluster by itself.
+  }
+  \item{weights}{A vector for the relative tree heights at different
+	cut. This will be used as weights in computing wNMI.}
+  \item{hc}{An object of class 'hclust'. This is the hierarchical tree
+	of the reference cell types.} 
+}
+
+\author{
+Zhijin Wu <zhijin_wu@brown.edu>, Hao Wu <hao.wu@emory.edu>
+}
+
+
+\seealso{
+
+}
+\examples{
+data(Zhengmix8eq)
+ctStruct = createRef(Y, trueclass)
+plot(ctStruct$hc)
+}
diff --git a/man/createWeights.Rd b/man/createWeights.Rd
@@ -0,0 +1,76 @@
+\name{createWeights}
+\alias{createWeights}
+
+\title{
+  Create weights used for computing weighted Rand Index (wRI)
+
+}
+\description{
+  This function takes an expression matrix and true cell classes, and
+  returns two square (JxJ) matrices, where J is the
+  number of classes provided by true cell class. 
+}
+
+\usage{
+createWeights(Y, classes)
+}
+
+\arguments{
+  \item{Y}{The expression matrix. Rows are genes, columns are cells.}
+  \item{classes}{True cell classes.}
+}
+
+\details{
+  There are two weight matrices W1 (score for putting two cells in the
+  same cluster) and W0 (score for separating two cells in different
+  clusters). We set W1 has diagonal values 1 and off-diagonal less than
+  1. To obtain off-diagonal values entry (i,j) in W1, we compute the
+  mean expression profiles for all cell types from single-cell data, and
+  then use the Pearson’s correlation of mean expression from cell types
+  i and j as W1[i,j]. Note that in general, correlations among
+  expression from different cell types are high because a majority of
+  genes are not differentially expressed among cell types. To make W1
+  scores more distinctive, we take a marker selection step to pick the
+  top 1000 genes with the largest variances of log expressions. The
+  Pearson’s correlation of mean expression from these genes are then
+  taken as the off-diagonal values for W1[i,j].
+
+  We make W0 has off diagonal values 1 and diagonal between 0 and 1,
+  reflecting that keeping the
+  separation existing in the reference receives full credit, but
+  breaking a (weak) tie may not reduce the score completely to 0. We
+  compute W0[i,i] based on the inter-cellular expression variances within
+  cell type i. To be specific, we take expressions for all cells in cell
+  type i and compute their Pearson’s correlations. W0[i,i] is defined as
+  1 minus the average inter-cellular Pearson’s correlations from all
+  cells. Thus, for a tight cluster where the within cell type
+  correlation is high, the score will be closer to 0, indicating
+  stronger penalty for separating cells for such a cluster. On the other
+  hand, for a loose cluster where the within cell type correlation is
+  low, the score will be larger, indicating weaker penalty.
+}
+
+\value{
+  A list with following fields:
+  \item{W0}{A square matrix of dimension JxJ, where J is the
+	number of classes provided by true cell class. The (i,j)-th entry is
+	the	score of separating two cells of types i and j in different
+	clusters.}
+  \item{W1}{A square matrix of dimension JxJ, where J is the
+	number of classes provided by by true cell class The(i,j)-th entry is
+	the score for putting two cells of different types in the same
+	cluster.}
+}
+
+\author{
+  Zhijin Wu <zhijin_wu@brown.edu>, Hao Wu <hao.wu@emory.edu>
+}
+
+\seealso{
+wRI
+}
+
+\examples{
+data(Zhengmix8eq)
+weights = createWeights(Y, trueclass)
+}