add approximate LAD
jbytecode committed Jan 8, 2023
1 parent adf8925 commit 29e9e1b
Showing 15 changed files with 164 additions and 51 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,4 +1,5 @@
# Upcoming Release
- Add `exact` argument for LAD. If `exact` is true, the linear-programming-based exact solution is found; otherwise, a GA-based search is performed to yield an approximate solution.

# v0.8.19
- Update Satman(2013) algorithm
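A minimal usage sketch of the new keyword (assuming `@formula`, `createRegressionSetting`, and the `phones` dataset are available via the package, as in its test suite):

```julia
using LinRegOutliers

setting = createRegressionSetting(@formula(calls ~ year), phones)

exactresult  = lad(setting)                 # LP-based exact solution (default)
approxresult = lad(setting, exact = false)  # GA + Hooke-Jeeves approximation
```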
33 changes: 14 additions & 19 deletions docs/make.jl
@@ -2,25 +2,20 @@ using Documenter, LinRegOutliers


makedocs(
format = Documenter.HTML(
prettyurls = get(ENV, "CI", nothing) == "true",
collapselevel = 2,
# assets = ["assets/favicon.ico", "assets/extra_styles.css"],
),
sitename="LinRegOutliers",
authors = "Mehmet Hakan Satman <[email protected]>, Shreesh Adiga <[email protected]>, Guillermo Angeris <[email protected]>, Emre Akadal <[email protected]>",
pages = [
"Algorithms" => "algorithms.md",
"Diagnostics" => "diagnostics.md",
"Types" => "types.md",
"Datasets" => "datasets.md"
]
)


deploydocs(
repo = "github.com/jbytecode/LinRegOutliers",
format = Documenter.HTML(
prettyurls = get(ENV, "CI", nothing) == "true",
collapselevel = 2,
# assets = ["assets/favicon.ico", "assets/extra_styles.css"],
),
sitename = "LinRegOutliers",
authors = "Mehmet Hakan Satman <[email protected]>, Shreesh Adiga <[email protected]>, Guillermo Angeris <[email protected]>, Emre Akadal <[email protected]>",
pages = [
"Algorithms" => "algorithms.md",
"Diagnostics" => "diagnostics.md",
"Types" => "types.md",
"Datasets" => "datasets.md",
],
)



deploydocs(repo = "github.com/jbytecode/LinRegOutliers")
21 changes: 13 additions & 8 deletions src/LinRegOutliers.jl
@@ -20,6 +20,19 @@ export find_minimum_nonzero
export @extractRegressionSetting


# Compact genetic algorithm
include("cga.jl")
import .CGA: cga

# Hooke-Jeeves algorithm
include("hj.jl")
# The function hj() is not exported.

# Genetic Algorithm
include("ga.jl")
import .GA: ga, RealChromosome


# Predefined datasets used in outlier detection literature
include("data.jl")
import .DataSets: phones, hbk, stackloss
@@ -129,14 +142,6 @@ import .Hadi94: hadi1994
include("dataimage.jl")
import .DataImage: dataimage

# Compact genetic algorithm
include("cga.jl")
import .CGA: cga


# Genetic Algorithm
include("ga.jl")
import .GA: ga, RealChromosome

# Modified and original Satman (2012) algorithms
include("gwlts.jl")
5 changes: 3 additions & 2 deletions src/asm2000.jl
@@ -7,7 +7,8 @@ import LinearAlgebra: det
import Clustering: Hclust, hclust, cutree


import ..Basis: RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import ..Diagnostics: mahalanobisSquaredBetweenPairs
import ..LTS: lts
import ..SMR98: majona
@@ -65,7 +66,7 @@ function asm2000(X::Array{Float64,2}, y::Array{Float64,1})::Dict
#stdfit = standardize(ZScoreTransform, predicteds, dims = 1)
stdres = zstandardize(resids)
stdfit = zstandardize(predicteds)

pairs = hcat(stdfit, stdres)

pairs = hcat(resids, predicteds)
7 changes: 5 additions & 2 deletions src/dataimage.jl
@@ -42,7 +42,10 @@ julia> Plots.plot(di)
Marchette, David J., and Jeffrey L. Solka. "Using data images for outlier detection."
Computational Statistics & Data Analysis 43.4 (2003): 541-552.
"""
function dataimage(dataMatrix::Array{Float64,2}; distance = :mahalanobis)::Array{RGB{Float64}, 2}
function dataimage(
dataMatrix::Array{Float64,2};
distance = :mahalanobis,
)::Array{RGB{Float64},2}
d = nothing
if distance == :mahalanobis
d = mahalanobisSquaredBetweenPairs(dataMatrix)
@@ -55,7 +58,7 @@ function dataimage(dataMatrix::Array{Float64,2}; distance = :mahalanobis)::Array
end
colours = 1.0 .- d / maximum(d)
n, _ = size(d)
colormatrix = Array{RGB{Float64}, 2}(undef, n, n)
colormatrix = Array{RGB{Float64},2}(undef, n, n)
for i = 1:n
for j = 1:n
@inbounds colormatrix[i, j] = RGB(colours[i, j])
6 changes: 3 additions & 3 deletions src/diagnostics.jl
@@ -55,7 +55,7 @@ end
function mahalanobisSquaredBetweenPairs(pairs::Matrix; covmatrix = nothing)
n, _ = size(pairs)
newmat = zeros(Float64, n, n)
if isnothing(covmatrix)
if isnothing(covmatrix)
covmatrix = cov(pairs)
end
try
@@ -194,7 +194,7 @@ end

function dffit(X::Array{Float64,2}, y::Array{Float64,1}, i::Int)::Float64
n, _ = size(X)
indices = [j for j in 1:n if i != j]
indices = [j for j = 1:n if i != j]
olsfull = ols(X, y)
Xsub = X[indices, :]
ysub = y[indices]
@@ -423,7 +423,7 @@ end

function jacknifedS(X::Array{Float64,2}, y::Array{Float64,1}, k::Int)::Float64
n, p = size(X)
indices = [i for i in 1:n if i != k]
indices = [i for i = 1:n if i != k]
Xsub = X[indices, :]
ysub = y[indices]
olsreg = ols(Xsub, ysub)
49 changes: 49 additions & 0 deletions src/hj.jl
@@ -0,0 +1,49 @@
module HookeJeeves

export hj

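# Return a copy of `par` with its `p`-th entry shifted by `d`.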
function mutate(par, p, d)
newpar = copy(par)
newpar[p] += d
return newpar
end

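# Coordinate-wise pattern search (Hooke-Jeeves): probe each coordinate one
# step left and right, keep an improving move, and halve the step once a
# full sweep brings no improvement; stop when the step falls below
# `endstep` or after `maxiter` iterations.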
function hj(
f::FType,
par::Vector{Float64};
maxiter = 1000,
startstep = 5.0,
endstep = 0.0001,
) where {FType<:Function}
p = length(par)
currentstep = startstep
iter::Int64 = 0
while iter < maxiter
fold = f(par)
fnow = fold
for currentp = 1:p
mutateleft = mutate(par, currentp, -currentstep)
fleft = f(mutateleft)
mutateright = mutate(par, currentp, currentstep)
fright = f(mutateright)
if fleft < fold
par = mutateleft
fnow = fleft
elseif fright < fold
par = mutateright
fnow = fright
end
end
if fold <= fnow
currentstep /= 2
end
if currentstep < endstep
break
end
iter += 1
end

return Dict("par" => par, "iter" => iter, "step" => currentstep)
end

end # end of module
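A quick sanity check of the new routine on a convex toy objective (illustrative only; `hj` is exported by the `HookeJeeves` module but not re-exported by the package):

```julia
import .HookeJeeves: hj

f(par) = (par[1] - 3.0)^2 + (par[2] + 1.0)^2   # minimum at (3, -1)

result = hj(f, [0.0, 0.0], startstep = 5.0, endstep = 1e-6)
result["par"]    # ≈ [3.0, -1.0]
result["iter"]   # iterations consumed before the step shrank below endstep
```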
43 changes: 38 additions & 5 deletions src/lad.jl
@@ -8,14 +8,18 @@ using GLPK
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, applyColumns

import ..HookeJeeves: hj
import ..GA: ga

"""
lad(setting)
lad(setting; exact = true)
Perform Least Absolute Deviations regression for a given regression setting.
# Arguments
- `setting::RegressionSetting`: RegressionSetting object with a formula and dataset.
- `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
# Description
The LAD estimator searches for the regression parameter estimates that minimize the sum of absolute residuals.
@@ -51,23 +55,52 @@ Dict{Any,Any} with 2 entries:
```
"""
function lad(setting::RegressionSetting)
function lad(setting::RegressionSetting; exact::Bool = true)
X, y = @extractRegressionSetting setting
return lad(X, y)
return lad(X, y, exact = exact)
end


"""
lad(X, y)
lad(X, y; exact = true)
Perform Least Absolute Deviations regression for a given design matrix and response vector.
# Arguments
- `X::Array{Float64, 2}`: Design matrix of the linear model.
- `y::Array{Float64, 1}`: Response vector of the linear model.
- `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
"""
function lad(X::Array{Float64,2}, y::Array{Float64,1})
function lad(X::Array{Float64,2}, y::Array{Float64,1}; exact::Bool = true)
if exact
return lad_exact(X, y)
else
return lad_approx(X, y)
end
end

function lad_approx(X::Array{Float64,2}, y::Array{Float64,1})
n, p = size(X)

mins = ones(Float64, p) * 10^6 * (-1.0)
maxs = ones(Float64, p) * 10^6
popsize = 100

function fcost(par)
return sum(abs.(y .- X * par))
end

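# Positional arguments passed to ga() here (meanings assumed from context;
# see src/ga.jl): population size, chromosome length, cost function,
# lower/upper bounds, crossover and mutation probabilities, elitism count,
# and generation budget.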
garesult = ga(popsize, p, fcost, mins, maxs, 0.90, 0.05, 1, p * 1000)
best = garesult[1]
hookejeevesresult =
hj(fcost, best.genes, maxiter = 109000, startstep = 10.0, endstep = 0.000001)
betas = hookejeevesresult["par"]
result = Dict("betas" => betas, "residuals" => y .- X * betas)
return result
end

function lad_exact(X::Array{Float64,2}, y::Array{Float64,1})
n, p = size(X)

m = JuMP.Model(GLPK.Optimizer)
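A sketch of the direct `(X, y)` method on the contaminated toy data used in `test/testlad.jl`; nine of the ten points lie exactly on y = 2x, so both paths should recover betas near [0, 2]:

```julia
X = hcat(ones(10), Float64.(1:10))   # intercept column plus regressor
y = Float64[2, 4, 6, 8, 10, 12, 14, 16, 18, 1000]

result = lad(X, y, exact = false)    # GA seeds the search, Hooke-Jeeves refines it
result["betas"]                      # ≈ [0.0, 2.0]; the single outlier barely moves the fit
```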
2 changes: 1 addition & 1 deletion src/py95.jl
@@ -148,7 +148,7 @@ function jacknifedS(
omittedIndices::Array{Int,1},
)::Float64
n, p = size(X)
indices = [i for i in 1:n if !(i in omittedIndices)]
indices = [i for i = 1:n if !(i in omittedIndices)]
olsreg = ols(X[indices, :], y[indices])
e = residuals(olsreg)
s = sqrt(sum(e .^ 2.0) / (n - p - length(omittedIndices)))
2 changes: 1 addition & 1 deletion src/satman2013.jl
@@ -116,7 +116,7 @@ function satman2013(X::Array{Float64,2}, y::Array{Float64,1})

result = Dict()
result["outliers"] = outlierset
result["betas"] = betas
result["betas"] = betas
result["residuals"] = resids

return result
3 changes: 2 additions & 1 deletion src/smr98.jl
@@ -5,7 +5,8 @@ export majona, smr98


import Clustering: Hclust, hclust, cutree
import ..Basis: RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import Distributions: mean, std
import ..OrdinaryLeastSquares: ols, residuals, predict

2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -33,4 +33,4 @@ include("testatkinson94.jl")
include("testimon2005.jl")
include("testcm97.jl")
include("testbacon2000.jl")
include("testdataimage.jl")
include("testdataimage.jl")
2 changes: 1 addition & 1 deletion test/testdataimage.jl
@@ -15,6 +15,6 @@
RGB{Float64}(0.3914193805498154, 0.3914193805498154, 0.3914193805498154) RGB{Float64}(0.36901018379996964, 0.36901018379996964, 0.36901018379996964) RGB{Float64}(0.037749551350623745, 0.037749551350623745, 0.037749551350623745) RGB{Float64}(1.0, 1.0, 1.0)
]

@test di isa Matrix
@test di isa Matrix
@test result == di
end
37 changes: 31 additions & 6 deletions test/testlad.jl
@@ -1,26 +1,28 @@
@testset "LAD" begin

@testset "LAD - Algorithm" begin
@testset "LAD - Phones - Exact" begin
eps = 0.0001
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = lad(reg)
betas = result["betas"]
@test betas[1] < 0
@test betas[2] > 0
@test abs(betas[1] - -75.19) < eps
@test abs(betas[2] - 1.53) < eps
end

@testset "LAD - Algorithm - Exact" begin
df2 = DataFrame(
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
y = [2, 4, 6, 8, 10, 12, 14, 16, 18, 1000],
)
reg2 = createRegressionSetting(@formula(y ~ x), df2)
result2 = lad(reg2)
betas2 = result2["betas"]
@test abs(betas2[1] - 0) < eps
@test abs(betas2[2] - 2) < eps
@test betas2[1] == 0.0
@test betas2[2] == 2.0
end

@testset "LAD with (X, y)" begin
@testset "LAD with (X, y) - Phones - Exact" begin
eps = 0.0001
df2 = DataFrame(
x = Float64[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
@@ -35,4 +37,27 @@
@test abs(betas2[2] - 2) < eps
end

@testset "LAD - Phones - Approx" begin
eps = 0.0001
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = lad(reg, exact = false)
betas = result["betas"]
@test betas[1] < 0
@test betas[2] > 0
end

@testset "LAD - Algorithm - Approx" begin
eps = 0.1
df2 = DataFrame(
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
y = [2, 4, 6, 8, 10, 12, 14, 16, 18, 1000],
)
reg2 = createRegressionSetting(@formula(y ~ x), df2)
result2 = lad(reg2, exact = false)
betas2 = result2["betas"]
@test abs(betas2[1] - 0) < eps
@test abs(betas2[2] - 2) < eps
end

end
2 changes: 1 addition & 1 deletion test/testquantileregression.jl
@@ -27,7 +27,7 @@

betas2 = result["betas"]
@test abs(betas2[1] - 48.0057823) < eps
@test abs(betas2[2] - 0.4856801 ) < eps
@test abs(betas2[2] - 0.4856801) < eps
end

@testset "Quantile Regression - q = 0.95" begin