From 86d7b28b3faff62ae1266c25160c24d66e2dcc0a Mon Sep 17 00:00:00 2001
From: Darren Wilkinson <darrenjwilkinson@btinternet.com>
Date: Mon, 5 Jun 2017 21:03:24 +0100
Subject: [PATCH] improved diagnostic plots, standardised residuals, improved
 computation of standard errors

---
 src/main/scala/scalaglm/Glm.scala     |  7 +---
 src/main/scala/scalaglm/Lm.scala      | 50 ++++++++++++++++++---------
 src/main/scala/scalaglm/Predict.scala |  4 +--
 src/main/scala/scalaglm/Utils.scala   |  8 +++--
 src/test/scala/lm-tests.scala         |  2 ++
 5 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/src/main/scala/scalaglm/Glm.scala b/src/main/scala/scalaglm/Glm.scala
index 4e5f598..226793f 100644
--- a/src/main/scala/scalaglm/Glm.scala
+++ b/src/main/scala/scalaglm/Glm.scala
@@ -121,15 +121,10 @@ case class Glm(y: DenseVector[Double],
     */
   lazy val ri = inv(r)
 
-  /** 
-    * Inverse of the final X'WX matrix
-    */
-  lazy val xtwxi = ri * (ri.t)
-
   /** 
     * Standard errors for the regression coefficients
     */
-  lazy val se = breeze.numerics.sqrt(diag(xtwxi))
+  lazy val se = norm(ri(*, ::))
 
   /** 
     * z-statistics for the regression coefficients
diff --git a/src/main/scala/scalaglm/Lm.scala b/src/main/scala/scalaglm/Lm.scala
index fa8b3a0..c1777a0 100644
--- a/src/main/scala/scalaglm/Lm.scala
+++ b/src/main/scala/scalaglm/Lm.scala
@@ -115,15 +115,10 @@ case class Lm(y: DenseVector[Double],
     */
   lazy val ri = inv(r)
 
-  /** 
-    * The inverse of X'X (but calculated efficiently)
-    */
-  lazy val xtxi = ri * (ri.t)
-
   /**  
     * Standard errors for the regression coefficients
     */
-  lazy val se = breeze.numerics.sqrt(diag(xtxi)) * rse
+  lazy val se = norm(ri(*,::)) * rse
 
   /** 
     * t-statistics for the regression coefficients
@@ -212,25 +207,48 @@ case class Lm(y: DenseVector[Double],
   import breeze.plot._
   def plots: Figure = {
     val fig = Figure("Linear regression diagnostics")
-    val p0 = fig.subplot(2,2,0)
+    val p0 = fig.subplot(2,2,0) // Obs against fitted
     p0 += plot(fitted,y,'+')
     p0 += plot(fitted,fitted)
     p0.title = "Observations against fitted values"
     p0.xlabel = "Fitted value"
     p0.ylabel = "Observation"
-    val p1 = fig.subplot(2,2,1)
-    p1 += plot(fitted,residuals,'+')
-    p1.title = "Residuals against fitted values"
+    val p1 = fig.subplot(2,2,1) // Studentised against fitted
+    p1 += plot(fitted,studentised,'+')
+    p1 += plot(fitted,DenseVector.fill(n)(0.0))
+    p1.title = "Studentised residuals against fitted values"
     p1.xlabel = "Fitted value"
-    p1.ylabel = "Residual"
-    val p2 = fig.subplot(2,2,2)
-    p2 += breeze.plot.hist(residuals)
-    p2.title = "Histogram of residuals"
-    p2.xlabel = "Residual"
-    // TODO: Add a Q-Q plot..
+    p1.ylabel = "Studentised residual"
+    val p2 = fig.subplot(2,2,2) // Histogram of studentised
+    p2 += breeze.plot.hist(studentised)
+    p2.title = "Histogram of studentised residuals"
+    p2.xlabel = "Studentised residual"
+    p2.ylabel = "Frequency"
+    val p3 = fig.subplot(2,2,3) // Residual Q-Q plot
+    val sorted = studentised.toArray.sorted
+    val grid = linspace(1.0/n,1.0-1.0/n,n)
+    import breeze.stats.distributions.Gaussian
+    val quantiles = grid map {Gaussian(0.0,1.0).inverseCdf(_)}
+    p3 += plot(quantiles, sorted)
+    p3 += plot(quantiles, quantiles)
+    p3.title = "Residual Q-Q plot"
+    p3.xlabel = "Standard normal quantiles"
+    p3.ylabel = "Studentised residuals"
     fig
   }
 
+  /** 
+    * Square root of the leverage vector
+    */
+  lazy val sh = norm(q(*,::))
+
+  /** 
+    * Vector containing the leverages (diagonal of the hat matrix)
+    */
+  lazy val h = sh * sh
+
+  import breeze.numerics.sqrt
+  lazy val studentised = residuals / sqrt(1.0 - h) / rse
 
 } // case class Lm
 
diff --git a/src/main/scala/scalaglm/Predict.scala b/src/main/scala/scalaglm/Predict.scala
index e13b304..7316141 100644
--- a/src/main/scala/scalaglm/Predict.scala
+++ b/src/main/scala/scalaglm/Predict.scala
@@ -45,12 +45,12 @@ case class PredictLm(mod: Lm, newX: DenseMatrix[Double]) extends Predict {
   /** 
     * for internal use (probably should be marked private)
     */
-  lazy val rtix = nX * mod.ri
+  lazy val xrti = nX * mod.ri
 
   /** 
     * standard errors for the predictions
     */
-  lazy val se = norm(rtix(*, ::)) * mod.rse
+  lazy val se = norm(xrti(*, ::)) * mod.rse
 } // case class PredictLm
 
 
diff --git a/src/main/scala/scalaglm/Utils.scala b/src/main/scala/scalaglm/Utils.scala
index b62fea5..9746eb7 100644
--- a/src/main/scala/scalaglm/Utils.scala
+++ b/src/main/scala/scalaglm/Utils.scala
@@ -138,11 +138,11 @@ object Utils {
           if (i == j) {
             pij += hist(mat(::,i))
             pij.title = names(i)
-            pij.xlabel = i.toString
+            pij.xlabel = names(i)
           } else {
             pij += plot(mat(::,j),mat(::,i),'.')
-            pij.xlabel = j.toString
-            pij.ylabel = i.toString
+            pij.xlabel = names(j)
+            pij.ylabel = names(i)
           }
         }
       }
@@ -183,10 +183,12 @@ object Utils {
     // read the file from disk
     val mat = csvread(new java.io.File(fileName))
     println("Dim: " + mat.rows + " " + mat.cols)
+    Utils.pairs(mat, List("Freq", "Angle", "Chord", "Velo", "Thick", "Sound"))
     val y = mat(::, 5) // response is the final column
     val X = mat(::, 0 to 4)
     val mod = Lm(y, X, List("Freq", "Angle", "Chord", "Velo", "Thick"))
     mod.summary
+    mod.plots
 
     // test without name list
     Lm(y,X,false).summary
diff --git a/src/test/scala/lm-tests.scala b/src/test/scala/lm-tests.scala
index 8bdee72..556bdae 100644
--- a/src/test/scala/lm-tests.scala
+++ b/src/test/scala/lm-tests.scala
@@ -86,6 +86,8 @@ class LmSpec extends FlatSpec {
     assert(norm(mod.p - rP) <= 0.00001)
     val rF = DenseVector[Double](R.evalD1("mod$fitted.values"))
     assert(norm(mod.fitted - rF) <= 0.0001)
+    val rStud = DenseVector[Double](R.evalD1("rstandard(mod)"))
+    assert(norm(mod.studentised - rStud) <= 0.0001)
     val rPred = DenseVector[Double](R.evalD1("predict(mod)"))
     assert(norm(mod.predict().fitted - rPred) <= 0.0001)
     val rPredSe = DenseVector[Double](R.evalD1("predict(mod,se.fit=TRUE)$se.fit"))