From 86d7b28b3faff62ae1266c25160c24d66e2dcc0a Mon Sep 17 00:00:00 2001 From: Darren Wilkinson Date: Mon, 5 Jun 2017 21:03:24 +0100 Subject: [PATCH] improved diagnostic plots, standardised residuals, improved computation of standard errors --- src/main/scala/scalaglm/Glm.scala | 7 +--- src/main/scala/scalaglm/Lm.scala | 50 ++++++++++++++++++--------- src/main/scala/scalaglm/Predict.scala | 4 +-- src/main/scala/scalaglm/Utils.scala | 8 +++-- src/test/scala/lm-tests.scala | 2 ++ 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/main/scala/scalaglm/Glm.scala b/src/main/scala/scalaglm/Glm.scala index 4e5f598..226793f 100644 --- a/src/main/scala/scalaglm/Glm.scala +++ b/src/main/scala/scalaglm/Glm.scala @@ -121,15 +121,10 @@ case class Glm(y: DenseVector[Double], */ lazy val ri = inv(r) - /** - * Inverse of the final X'WX matrix - */ - lazy val xtwxi = ri * (ri.t) - /** * Standard errors for the regression coefficients */ - lazy val se = breeze.numerics.sqrt(diag(xtwxi)) + lazy val se = norm(ri(*, ::)) /** * z-statistics for the regression coefficients diff --git a/src/main/scala/scalaglm/Lm.scala b/src/main/scala/scalaglm/Lm.scala index fa8b3a0..c1777a0 100644 --- a/src/main/scala/scalaglm/Lm.scala +++ b/src/main/scala/scalaglm/Lm.scala @@ -115,15 +115,10 @@ case class Lm(y: DenseVector[Double], */ lazy val ri = inv(r) - /** - * The inverse of X'X (but calculated efficiently) - */ - lazy val xtxi = ri * (ri.t) - /** * Standard errors for the regression coefficients */ - lazy val se = breeze.numerics.sqrt(diag(xtxi)) * rse + lazy val se = norm(ri(*,::)) * rse /** * t-statistics for the regression coefficients @@ -212,25 +207,48 @@ case class Lm(y: DenseVector[Double], import breeze.plot._ def plots: Figure = { val fig = Figure("Linear regression diagnostics") - val p0 = fig.subplot(2,2,0) + val p0 = fig.subplot(2,2,0) // Obs against fitted p0 += plot(fitted,y,'+') p0 += plot(fitted,fitted) p0.title = "Observations against fitted values" p0.xlabel = "Fitted value" p0.ylabel = "Observation" - val p1 = fig.subplot(2,2,1) - p1 += plot(fitted,residuals,'+') - p1.title = "Residuals against fitted values" + val p1 = fig.subplot(2,2,1) // Studentised against fitted + p1 += plot(fitted,studentised,'+') + p1 += plot(fitted,DenseVector.fill(n)(0.0)) + p1.title = "Studentised residuals against fitted values" p1.xlabel = "Fitted value" - p1.ylabel = "Residual" - val p2 = fig.subplot(2,2,2) - p2 += breeze.plot.hist(residuals) - p2.title = "Histogram of residuals" - p2.xlabel = "Residual" - // TODO: Add a Q-Q plot.. + p1.ylabel = "Studentised residual" + val p2 = fig.subplot(2,2,2) // Histogram of studentised + p2 += breeze.plot.hist(studentised) + p2.title = "Histogram of studentised residuals" + p2.xlabel = "Studentised residual" + p2.ylabel = "Frequency" + val p3 = fig.subplot(2,2,3) // Residual Q-Q plot + val sorted = studentised.toArray.sorted + val grid = linspace(1.0/n,1.0-1.0/n,n) + import breeze.stats.distributions.Gaussian + val quantiles = grid map {Gaussian(0.0,1.0).inverseCdf(_)} + p3 += plot(quantiles, sorted) + p3 += plot(quantiles, quantiles) + p3.title = "Residual Q-Q plot" + p3.xlabel = "Standard normal quantiles" + p3.ylabel = "Studentised residuals" fig } + /** + * Square root of the leverage vector + */ + lazy val sh = norm(q(*,::)) + + /** + * Vector containing the leverages (diagonal of the hat matrix) + */ + lazy val h = sh * sh + + import breeze.numerics.sqrt + lazy val studentised = residuals / sqrt(1.0 - h) / rse } // case class Lm diff --git a/src/main/scala/scalaglm/Predict.scala b/src/main/scala/scalaglm/Predict.scala index e13b304..7316141 100644 --- a/src/main/scala/scalaglm/Predict.scala +++ b/src/main/scala/scalaglm/Predict.scala @@ -45,12 +45,12 @@ case class PredictLm(mod: Lm, newX: DenseMatrix[Double]) extends Predict { /** * for internal use (probably should be marked private) */ - lazy val rtix = nX * mod.ri + lazy val xrti = nX * mod.ri /** * standard errors for the predictions */ - lazy val se = norm(rtix(*, ::)) * mod.rse + lazy val se = norm(xrti(*, ::)) * mod.rse } // case class PredictLm diff --git a/src/main/scala/scalaglm/Utils.scala b/src/main/scala/scalaglm/Utils.scala index b62fea5..9746eb7 100644 --- a/src/main/scala/scalaglm/Utils.scala +++ b/src/main/scala/scalaglm/Utils.scala @@ -138,11 +138,11 @@ object Utils { if (i == j) { pij += hist(mat(::,i)) pij.title = names(i) - pij.xlabel = i.toString + pij.xlabel = names(i) } else { pij += plot(mat(::,j),mat(::,i),'.') - pij.xlabel = j.toString - pij.ylabel = i.toString + pij.xlabel = names(j) + pij.ylabel = names(i) } } } @@ -183,10 +183,12 @@ object Utils { // read the file from disk val mat = csvread(new java.io.File(fileName)) println("Dim: " + mat.rows + " " + mat.cols) + Utils.pairs(mat, List("Freq", "Angle", "Chord", "Velo", "Thick", "Sound")) val y = mat(::, 5) // response is the final column val X = mat(::, 0 to 4) val mod = Lm(y, X, List("Freq", "Angle", "Chord", "Velo", "Thick")) mod.summary + mod.plots // test without name list Lm(y,X,false).summary diff --git a/src/test/scala/lm-tests.scala b/src/test/scala/lm-tests.scala index 8bdee72..556bdae 100644 --- a/src/test/scala/lm-tests.scala +++ b/src/test/scala/lm-tests.scala @@ -86,6 +86,8 @@ class LmSpec extends FlatSpec { assert(norm(mod.p - rP) <= 0.00001) val rF = DenseVector[Double](R.evalD1("mod$fitted.values")) assert(norm(mod.fitted - rF) <= 0.0001) + val rStud = DenseVector[Double](R.evalD1("rstandard(mod)")) + assert(norm(mod.studentised - rStud) <= 0.0001) val rPred = DenseVector[Double](R.evalD1("predict(mod)")) assert(norm(mod.predict().fitted - rPred) <= 0.0001) val rPredSe = DenseVector[Double](R.evalD1("predict(mod,se.fit=TRUE)$se.fit"))