add approximate LAD
jbytecode committed Jan 8, 2023
1 parent adf8925 commit 29e9e1b
Showing 15 changed files with 164 additions and 51 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,4 +1,5 @@
# Upcoming Release
- Add `exact` argument for LAD. If `exact` is true, the linear-programming-based exact solution is found; otherwise, a GA-based search is performed to yield an approximate solution.

# v0.8.19
- Update Satman(2013) algorithm
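A minimal usage sketch of the new keyword (assuming `@formula`, `createRegressionSetting`, and the `phones` dataset are available via the package, as in its test suite):

```julia
using LinRegOutliers

setting = createRegressionSetting(@formula(calls ~ year), phones)

exactresult  = lad(setting)                 # LP-based exact solution (default)
approxresult = lad(setting, exact = false)  # GA + Hooke-Jeeves approximation
```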
33 changes: 14 additions & 19 deletions docs/make.jl
@@ -2,25 +2,20 @@ using Documenter, LinRegOutliers


makedocs(
format = Documenter.HTML(
prettyurls = get(ENV, "CI", nothing) == "true",
collapselevel = 2,
# assets = ["assets/favicon.ico", "assets/extra_styles.css"],
),
sitename="LinRegOutliers",
authors = "Mehmet Hakan Satman <[email protected]>, Shreesh Adiga <[email protected]>, Guillermo Angeris <[email protected]>, Emre Akadal <[email protected]>",
pages = [
"Algorithms" => "algorithms.md",
"Diagnostics" => "diagnostics.md",
"Types" => "types.md",
"Datasets" => "datasets.md"
]
)


deploydocs(
repo = "github.com/jbytecode/LinRegOutliers",
format = Documenter.HTML(
prettyurls = get(ENV, "CI", nothing) == "true",
collapselevel = 2,
# assets = ["assets/favicon.ico", "assets/extra_styles.css"],
),
sitename = "LinRegOutliers",
authors = "Mehmet Hakan Satman <[email protected]>, Shreesh Adiga <[email protected]>, Guillermo Angeris <[email protected]>, Emre Akadal <[email protected]>",
pages = [
"Algorithms" => "algorithms.md",
"Diagnostics" => "diagnostics.md",
"Types" => "types.md",
"Datasets" => "datasets.md",
],
)



deploydocs(repo = "github.com/jbytecode/LinRegOutliers")
21 changes: 13 additions & 8 deletions src/LinRegOutliers.jl
@@ -20,6 +20,19 @@ export find_minimum_nonzero
export @extractRegressionSetting


# Compact genetic algorithm
include("cga.jl")
import .CGA: cga

# Hooke-Jeeves algorithm
include("hj.jl")
# The function hj() is not exported.

# Genetic Algorithm
include("ga.jl")
import .GA: ga, RealChromosome


# Predefined datasets used in outlier detection literature
include("data.jl")
import .DataSets: phones, hbk, stackloss
@@ -129,14 +142,6 @@ import .Hadi94: hadi1994
include("dataimage.jl")
import .DataImage: dataimage

# Compact genetic algorithm
include("cga.jl")
import .CGA: cga


# Genetic Algorithm
include("ga.jl")
import .GA: ga, RealChromosome

# Modified and original Satman (2012) algorithms
include("gwlts.jl")
5 changes: 3 additions & 2 deletions src/asm2000.jl
@@ -7,7 +7,8 @@ import LinearAlgebra: det
import Clustering: Hclust, hclust, cutree


import ..Basis: RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import ..Diagnostics: mahalanobisSquaredBetweenPairs
import ..LTS: lts
import ..SMR98: majona
@@ -65,7 +66,7 @@ function asm2000(X::Array{Float64,2}, y::Array{Float64,1})::Dict
#stdfit = standardize(ZScoreTransform, predicteds, dims = 1)
stdres = zstandardize(resids)
stdfit = zstandardize(predicteds)

pairs = hcat(stdfit, stdres)

pairs = hcat(resids, predicteds)
7 changes: 5 additions & 2 deletions src/dataimage.jl
@@ -42,7 +42,10 @@ julia> Plots.plot(di)
Marchette, David J., and Jeffrey L. Solka. "Using data images for outlier detection."
Computational Statistics & Data Analysis 43.4 (2003): 541-552.
"""
function dataimage(dataMatrix::Array{Float64,2}; distance = :mahalanobis)::Array{RGB{Float64}, 2}
function dataimage(
dataMatrix::Array{Float64,2};
distance = :mahalanobis,
)::Array{RGB{Float64},2}
d = nothing
if distance == :mahalanobis
d = mahalanobisSquaredBetweenPairs(dataMatrix)
@@ -55,7 +58,7 @@ function dataimage(dataMatrix::Array{Float64,2}; distance = :mahalanobis)::Array
end
colours = 1.0 .- d / maximum(d)
n, _ = size(d)
colormatrix = Array{RGB{Float64}, 2}(undef, n, n)
colormatrix = Array{RGB{Float64},2}(undef, n, n)
for i = 1:n
for j = 1:n
@inbounds colormatrix[i, j] = RGB(colours[i, j])
6 changes: 3 additions & 3 deletions src/diagnostics.jl
@@ -55,7 +55,7 @@ end
function mahalanobisSquaredBetweenPairs(pairs::Matrix; covmatrix = nothing)
n, _ = size(pairs)
newmat = zeros(Float64, n, n)
if isnothing(covmatrix)
if isnothing(covmatrix)
covmatrix = cov(pairs)
end
try
@@ -194,7 +194,7 @@ end

function dffit(X::Array{Float64,2}, y::Array{Float64,1}, i::Int)::Float64
n, _ = size(X)
indices = [j for j in 1:n if i != j]
indices = [j for j = 1:n if i != j]
olsfull = ols(X, y)
Xsub = X[indices, :]
ysub = y[indices]
@@ -423,7 +423,7 @@ end

function jacknifedS(X::Array{Float64,2}, y::Array{Float64,1}, k::Int)::Float64
n, p = size(X)
indices = [i for i in 1:n if i != k]
indices = [i for i = 1:n if i != k]
Xsub = X[indices, :]
ysub = y[indices]
olsreg = ols(Xsub, ysub)
49 changes: 49 additions & 0 deletions src/hj.jl
@@ -0,0 +1,49 @@
module HookeJeeves

export hj

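# Return a copy of `par` with its `p`-th entry shifted by `d`.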
function mutate(par, p, d)
newpar = copy(par)
newpar[p] += d
return newpar
end

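# Coordinate-wise pattern search (Hooke-Jeeves): probe each coordinate one
# step left and right, keep an improving move, and halve the step once a
# full sweep brings no improvement; stop when the step falls below
# `endstep` or after `maxiter` iterations.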
function hj(
f::FType,
par::Vector{Float64};
maxiter = 1000,
startstep = 5.0,
endstep = 0.0001,
) where {FType<:Function}
p = length(par)
currentstep = startstep
iter::Int64 = 0
while iter < maxiter
fold = f(par)
fnow = fold
for currentp = 1:p
mutateleft = mutate(par, currentp, -currentstep)
fleft = f(mutateleft)
mutateright = mutate(par, currentp, currentstep)
fright = f(mutateright)
if fleft < fold
par = mutateleft
fnow = fleft
elseif fright < fold
par = mutateright
fnow = fright
end
end
if fold <= fnow
currentstep /= 2
end
if currentstep < endstep
break
end
iter += 1
end

return Dict("par" => par, "iter" => iter, "step" => currentstep)
end

end # end of module
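A quick sanity check of the new routine on a convex toy objective (illustrative only; `hj` is exported by the `HookeJeeves` module but not re-exported by the package):

```julia
import .HookeJeeves: hj

f(par) = (par[1] - 3.0)^2 + (par[2] + 1.0)^2   # minimum at (3, -1)

result = hj(f, [0.0, 0.0], startstep = 5.0, endstep = 1e-6)
result["par"]    # ≈ [3.0, -1.0]
result["iter"]   # iterations consumed before the step shrank below endstep
```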
43 changes: 38 additions & 5 deletions src/lad.jl
@@ -8,14 +8,18 @@ using GLPK
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, applyColumns

import ..HookeJeeves: hj
import ..GA: ga

"""
lad(setting)
lad(setting; exact = true)
Perform Least Absolute Deviations regression for a given regression setting.
# Arguments
- `setting::RegressionSetting`: RegressionSetting object with a formula and dataset.
- `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
# Description
The LAD estimator searches for the regression parameter estimates that minimize the sum of absolute residuals.
@@ -51,23 +55,52 @@ Dict{Any,Any} with 2 entries:
```
"""
function lad(setting::RegressionSetting)
function lad(setting::RegressionSetting; exact::Bool = true)
X, y = @extractRegressionSetting setting
return lad(X, y)
return lad(X, y, exact = exact)
end


"""
lad(X, y)
lad(X, y; exact = true)
Perform Least Absolute Deviations regression for a given design matrix and response vector.
# Arguments
- `X::Array{Float64, 2}`: Design matrix of the linear model.
- `y::Array{Float64, 1}`: Response vector of the linear model.
- `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
"""
function lad(X::Array{Float64,2}, y::Array{Float64,1})
function lad(X::Array{Float64,2}, y::Array{Float64,1}; exact::Bool = true)
if exact
return lad_exact(X, y)
else
return lad_approx(X, y)
end
end

function lad_approx(X::Array{Float64,2}, y::Array{Float64,1})
n, p = size(X)

mins = ones(Float64, p) * 10^6 * (-1.0)
maxs = ones(Float64, p) * 10^6
popsize = 100

function fcost(par)
return sum(abs.(y .- X * par))
end

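# Positional arguments passed to ga() here (meanings assumed from context;
# see src/ga.jl): population size, chromosome length, cost function,
# lower/upper bounds, crossover and mutation probabilities, elitism count,
# and generation budget.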
garesult = ga(popsize, p, fcost, mins, maxs, 0.90, 0.05, 1, p * 1000)
best = garesult[1]
hookejeevesresult =
hj(fcost, best.genes, maxiter = 109000, startstep = 10.0, endstep = 0.000001)
betas = hookejeevesresult["par"]
result = Dict("betas" => betas, "residuals" => y .- X * betas)
return result
end

function lad_exact(X::Array{Float64,2}, y::Array{Float64,1})
n, p = size(X)

m = JuMP.Model(GLPK.Optimizer)
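A sketch of the direct `(X, y)` method on the contaminated toy data used in `test/testlad.jl`; nine of the ten points lie exactly on y = 2x, so both paths should recover betas near [0, 2]:

```julia
X = hcat(ones(10), Float64.(1:10))   # intercept column plus regressor
y = Float64[2, 4, 6, 8, 10, 12, 14, 16, 18, 1000]

result = lad(X, y, exact = false)    # GA seeds the search, Hooke-Jeeves refines it
result["betas"]                      # ≈ [0.0, 2.0]; the single outlier barely moves the fit
```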
2 changes: 1 addition & 1 deletion src/py95.jl
@@ -148,7 +148,7 @@ function jacknifedS(
omittedIndices::Array{Int,1},
)::Float64
n, p = size(X)
indices = [i for i in 1:n if !(i in omittedIndices)]
indices = [i for i = 1:n if !(i in omittedIndices)]
olsreg = ols(X[indices, :], y[indices])
e = residuals(olsreg)
s = sqrt(sum(e .^ 2.0) / (n - p - length(omittedIndices)))
2 changes: 1 addition & 1 deletion src/satman2013.jl
@@ -116,7 +116,7 @@ function satman2013(X::Array{Float64,2}, y::Array{Float64,1})

result = Dict()
result["outliers"] = outlierset
result["betas"] = betas
result["betas"] = betas
result["residuals"] = resids

return result
3 changes: 2 additions & 1 deletion src/smr98.jl
@@ -5,7 +5,8 @@ export majona, smr98


import Clustering: Hclust, hclust, cutree
import ..Basis: RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import ..Basis:
RegressionSetting, @extractRegressionSetting, designMatrix, responseVector, zstandardize
import Distributions: mean, std
import ..OrdinaryLeastSquares: ols, residuals, predict

2 changes: 1 addition & 1 deletion test/runtests.jl
@@ -33,4 +33,4 @@ include("testatkinson94.jl")
include("testimon2005.jl")
include("testcm97.jl")
include("testbacon2000.jl")
include("testdataimage.jl")
include("testdataimage.jl")
2 changes: 1 addition & 1 deletion test/testdataimage.jl
@@ -15,6 +15,6 @@
RGB{Float64}(0.3914193805498154, 0.3914193805498154, 0.3914193805498154) RGB{Float64}(0.36901018379996964, 0.36901018379996964, 0.36901018379996964) RGB{Float64}(0.037749551350623745, 0.037749551350623745, 0.037749551350623745) RGB{Float64}(1.0, 1.0, 1.0)
]

@test di isa Matrix
@test di isa Matrix
@test result == di
end
37 changes: 31 additions & 6 deletions test/testlad.jl
@@ -1,26 +1,28 @@
@testset "LAD" begin

@testset "LAD - Algorithm" begin
@testset "LAD - Phones - Exact" begin
eps = 0.0001
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = lad(reg)
betas = result["betas"]
@test betas[1] < 0
@test betas[2] > 0
@test abs(betas[1] - -75.19) < eps
@test abs(betas[2] - 1.53) < eps
end

@testset "LAD - Algorithm - Exact" begin
df2 = DataFrame(
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
y = [2, 4, 6, 8, 10, 12, 14, 16, 18, 1000],
)
reg2 = createRegressionSetting(@formula(y ~ x), df2)
result2 = lad(reg2)
betas2 = result2["betas"]
@test abs(betas2[1] - 0) < eps
@test abs(betas2[2] - 2) < eps
@test betas2[1] == 0.0
@test betas2[2] == 2.0
end

@testset "LAD with (X, y)" begin
@testset "LAD with (X, y) - Phones - Exact" begin
eps = 0.0001
df2 = DataFrame(
x = Float64[1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
@@ -35,4 +37,27 @@
@test abs(betas2[2] - 2) < eps
end

@testset "LAD - Phones - Approx" begin
eps = 0.0001
df = phones
reg = createRegressionSetting(@formula(calls ~ year), df)
result = lad(reg, exact = false)
betas = result["betas"]
@test betas[1] < 0
@test betas[2] > 0
end

@testset "LAD - Algorithm - Approx" begin
eps = 0.1
df2 = DataFrame(
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
y = [2, 4, 6, 8, 10, 12, 14, 16, 18, 1000],
)
reg2 = createRegressionSetting(@formula(y ~ x), df2)
result2 = lad(reg2, exact = false)
betas2 = result2["betas"]
@test abs(betas2[1] - 0) < eps
@test abs(betas2[2] - 2) < eps
end

end
2 changes: 1 addition & 1 deletion test/testquantileregression.jl
@@ -27,7 +27,7 @@

betas2 = result["betas"]
@test abs(betas2[1] - 48.0057823) < eps
@test abs(betas2[2] - 0.4856801 ) < eps
@test abs(betas2[2] - 0.4856801) < eps
end

@testset "Quantile Regression - q = 0.95" begin