implement Chatterjee and Mächler (1997) algorithm
jbytecode committed Oct 28, 2020
1 parent 17f4de3 commit f4254b6
Showing 10 changed files with 153 additions and 22 deletions.
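
The estimator added below in `src/cm97.jl` is an iteratively reweighted least squares scheme: it starts from weights based on the hat-matrix diagonal, then repeatedly refits and recomputes the weights from the residuals until the coefficients converge. In sketch form, the weights it iterates are (with h_ii the hat-matrix diagonal, r_i the current residuals, p predictors, n observations):

```latex
% weights iterated by cm97 (as implemented in src/cm97.jl below)
w_i^{(0)} = \frac{1}{\max\left(h_{ii},\; p/n\right)}, \qquad
w_i^{(t+1)} = \frac{\left(1 - h_{ii}\right)^{2}}{\max\left(\lvert r_i^{(t)}\rvert,\; \operatorname{median}_j \lvert r_j^{(t)}\rvert\right)}
```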
28 changes: 14 additions & 14 deletions Manifest.toml
@@ -8,9 +8,9 @@ version = "2.3.0"

[[ArrayInterface]]
deps = ["LinearAlgebra", "Requires", "SparseArrays"]
git-tree-sha1 = "920136b6b8ae5bd28a3c45d68e55421dec156d7f"
git-tree-sha1 = "bd09109dffaa3926a20178cb8432edd729be0db0"
uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
version = "2.13.6"
version = "2.13.7"

[[Artifacts]]
deps = ["Pkg"]
@@ -70,9 +70,9 @@ version = "0.3.0"

[[Compat]]
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
git-tree-sha1 = "b4f242d5e9f65648d438a03d245a9fdc6e38d728"
git-tree-sha1 = "a706ff10f1cd8dab94f59fd09c0e657db8e77ff0"
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
version = "3.22.0"
version = "3.23.0"

[[CompilerSupportLibraries_jll]]
deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
@@ -99,9 +99,9 @@ version = "0.21.8"

[[DataStructures]]
deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
git-tree-sha1 = "db07bb22795762895b60e44d62b34b16c982a687"
git-tree-sha1 = "fb0aa371da91c1ff9dc7fbed6122d3e411420b9c"
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
version = "0.18.7"
version = "0.18.8"

[[DataValueInterfaces]]
git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
@@ -264,9 +264,9 @@ uuid = "82899510-4779-5014-852e-03e436cf321d"
version = "1.0.0"

[[JLLWrappers]]
git-tree-sha1 = "7cec881362e5b4e367ff0279dd99a06526d51a55"
git-tree-sha1 = "c70593677bbf2c3ccab4f7500d0f4dacfff7b75c"
uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
version = "1.1.2"
version = "1.1.3"

[[JSON]]
deps = ["Dates", "Mmap", "Parsers", "Unicode"]
@@ -329,9 +329,9 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"

[[MbedTLS]]
deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"]
git-tree-sha1 = "426a6978b03a97ceb7ead77775a1da066343ec6e"
git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe"
uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
version = "1.0.2"
version = "1.0.3"

[[MbedTLS_jll]]
deps = ["Libdl", "Pkg"]
@@ -441,9 +441,9 @@ version = "1.0.7"

[[Plots]]
deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryBasics", "GeometryTypes", "JSON", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"]
git-tree-sha1 = "61ece2ce87bbacbea01d3752aef1af89409d6a55"
git-tree-sha1 = "4d94e6b49fbb0c479e71e12aa1446c4053f5a25e"
uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
version = "1.7.1"
version = "1.7.3"

[[PooledArrays]]
deps = ["DataAPI"]
@@ -558,9 +558,9 @@ version = "0.10.3"

[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "016d1e1a00fabc556473b07161da3d39726ded35"
git-tree-sha1 = "da4cf579416c81994afd6322365d00916c79b8ae"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "0.12.4"
version = "0.12.5"

[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
2 changes: 1 addition & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "LinRegOutliers"
uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a"
authors = ["Mehmet Hakan Satman <[email protected]>", "Shreesh Adiga <[email protected]>"]
version = "0.7.0"
version = "0.8.0"

[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
2 changes: 1 addition & 1 deletion README.md
@@ -30,11 +30,11 @@ A Julia package for outlier detection in linear regression.
- Atkinson (1994) Forward Search Algorithm
- BACON Algorithm (Billor & Hadi & Velleman (2000))
- Hadi (1994) Algorithm
+- Chatterjee & Mächler (1997)
- Summary


## Unimplemented Methods
-- Chatterjee & Mächler (1997)
- Depth based estimators (Regression depth, deepest regression, etc.)
- Theil & Sen estimator for multiple regression

5 changes: 2 additions & 3 deletions src/.dev/outlier.jl
@@ -65,9 +65,8 @@ include("../atkinsonstalactiteplot.jl")
include("../imon2005.jl")
include("../ccf.jl")
include("../bacon.jl")
include("../cm97.jl")

println("ready")

using BenchmarkTools
println("ready")

reg = createRegressionSetting(@formula(y ~ x1 + x2 + x3), hbk)
10 changes: 10 additions & 0 deletions src/.dev/test.jl
@@ -667,3 +667,13 @@ end
result = bacon(reg, m=12)
@test result == [1, 3, 4, 21]
end


@testset "Chatterjee & Mächler (1997) cm97 with stackloss" begin
df2 = stackloss
reg = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss)
regresult = cm97(reg)
betas = regresult["betas"]
betas_in_article = [-37.00, 0.84, 0.63, -0.11]
@test isapprox(betas, betas_in_article, atol=0.01)
end
6 changes: 5 additions & 1 deletion src/LinRegOutliers.jl
@@ -114,6 +114,9 @@ include("bacon.jl")
# Imon 2005 Algorithm
include("imon2005.jl")

+#  Chatterjee & Mächler (1997) Algorithm
+include("cm97.jl")
+
# All-in-one
include("summary.jl")

@@ -132,7 +135,7 @@ export find_minimum_nonzero
# Data
export phones, hbk, stackloss
export weightloss, hs93randomdata, woodgravity
-export hills
+export hills, softdrinkdelivery

# Diagnostics
export dffit, dfbeta
@@ -172,5 +175,6 @@ export ccf
export imon2005
export atkinson94, atkinsonstalactiteplot, generate_stalactite_plot
export bacon
+export cm97

end # module
81 changes: 81 additions & 0 deletions src/cm97.jl
@@ -0,0 +1,81 @@
"""
cm97(setting; maxiter = 1000)
Perform the Chatterjee and Mächler (1997) algorithm for the given regression setting.
# Arguments
- `setting::RegressionSetting`: RegressionSetting object with a formula and dataset.
- `maxiter::Int`: maximum number of iterations (defaults to 1000).
# Examples
```julia-repl
julia> myreg = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss)
julia> result = cm97(myreg)
Dict{String,Any} with 3 entries:
"betas" => [-37.0007, 0.839285, 0.632333, -0.113208]
"iterations" => 22
"converged" => true
```
# References
Chatterjee, Samprit, and Martin Mächler. "Robust regression: A weighted least squares approach."
Communications in Statistics-Theory and Methods 26.6 (1997): 1381-1394.
"""
function cm97(setting::RegressionSetting; maxiter::Int=1000)

X = designMatrix(setting)
y = responseVector(setting)
return cm97(X, y, maxiter = maxiter)

end



function cm97(X::Array{Float64,2}, y::Array{Float64,1}; maxiter::Int=1000)

n, p = size(X)
pbar::Float64 = p / n
hat = hatmatrix(X)

w_is = Array{Float64}(undef, n)
betas = Array{Float64}(undef, p)

converged::Bool = false
iter::Int = 0

# initial weights
@inbounds for i in 1:n
w_is[i] = 1.0 / max(hat[i, i], pbar)
end
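# iteratively reweighted least squares: refit with the current weights, then update them from the residuals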

while iter <= maxiter
oldbetas = betas

wols = wls(X, y, w_is)
betas = coef(wols)
r = y - X * betas
medi = median(abs.(r))

@inbounds for i in 1:n
w_is[i] = (1.0 - hat[i, i])^2.0 / max(abs(r[i]), medi)
end

if isapprox(oldbetas, betas)
converged = true
break
end

iter += 1

end

result = Dict{String,Any}(
"converged" => converged,
"betas" => betas,
"iterations" => iter
)

return result
end
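
The second method takes a raw design matrix and response vector. A minimal sketch of calling it directly on the stackloss data (assumptions: the intercept column is added by hand, which `designMatrix` would otherwise provide, and the column names follow the formula used in the tests of this commit):

```julia
using LinRegOutliers

# design matrix with an explicit intercept column, response as a Float64 vector
X = hcat(ones(size(stackloss, 1)),
         Float64.(stackloss.airflow),
         Float64.(stackloss.watertemp),
         Float64.(stackloss.acidcond))
y = Float64.(stackloss.stackloss)

result = cm97(X, y)
result["betas"]   # expected ≈ [-37.00, 0.84, 0.63, -0.11], as in the docstring example
```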


26 changes: 25 additions & 1 deletion src/data.jl
@@ -177,4 +177,28 @@ const hills = DataFrame(
dist=[2.4, 6.0, 6.0, 7.5, 8.0, 8.0, 16.0, 6.0, 5.0, 6.0, 28.0, 5.0, 9.5, 6.0, 4.5, 10.0, 14.0, 3.0, 4.5, 5.5, 3.0, 3.5, 6.0, 2.0, 3.0, 4.0, 6.0, 5.0, 6.5, 5.0, 10.0, 6.0, 18.0, 4.5, 20.0],
climb=[650.0, 2500.0, 900.0, 800.0, 3070.0, 2866.0, 7500.0, 800.0, 800.0, 650.0, 2100.0, 2000.0, 2200.0, 500.0, 1500.0, 3000.0, 2200.0, 350.0, 1000.0, 600.0, 300.0, 1500.0, 2200.0, 900.0, 600.0, 2000.0, 800.0, 950.0, 1750.0, 500.0, 4400.0, 600.0, 5200.0, 850.0, 5000.0],
time=[0.268055555555556, 0.805833333333333, 0.560833333333333, 0.76, 1.03777777777778, 1.22027777777778, 3.41027777777778, 0.606111111111111, 0.495833333333333, 0.6625, 3.21111111111111, 0.7175, 1.08333333333333, 0.735555555555556, 0.448888888888889, 1.20416666666667, 1.64027777777778, 1.31083333333333, 0.290277777777778, 0.542777777777778, 0.265833333333333, 0.465, 0.794166666666667, 0.298888888888889, 0.311388888888889, 0.436944444444444, 0.573888888888889, 0.476111111111111, 0.841666666666667, 0.349166666666667, 1.42638888888889, 0.539722222222222, 2.8375, 0.468333333333333, 2.66388888888889]
)
)




"""
Soft Drink Delivery Data
# Components
- `cases::Array{Float64, 1}`: Independent variable.
- `distance::Array{Float64, 1}`: Independent variable.
- `time::Array{Float64, 1}`: Dependent variable.
# Model
time ~ distance + cases
# Reference
D. C. Montgomery and E. A. Peck (1992) Introduction to Linear Regression Analysis. Wiley, New York.
"""
const softdrinkdelivery = DataFrame(
time=[16.68, 11.5, 12.03, 14.88, 13.75, 18.11, 8.0, 17.83, 79.24, 21.5, 40.33, 21.0, 13.5, 19.75, 24.0, 29.0, 15.35, 19.0, 9.5, 35.1, 17.9, 52.32, 18.75, 19.83, 10.75],
distance=[560.0, 220.0, 340.0, 80.0, 150.0, 330.0, 110.0, 210.0, 1460.0, 605.0, 688.0, 215.0, 255.0, 462.0, 448.0, 776.0, 200.0, 132.0, 36.0, 770.0, 140.0, 810.0, 450.0, 635.0, 150.0],
cases=[7.0, 3.0, 3.0, 4.0, 6.0, 7.0, 2.0, 7.0, 30.0, 5.0, 16.0, 10.0, 4.0, 6.0, 9.0, 10.0, 6.0, 7.0, 3.0, 17.0, 10.0, 26.0, 9.0, 8.0, 4.0]
)
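
A minimal sketch of using the new dataset with the model documented above (assuming, as in the tests added in this commit, that `@formula` and `createRegressionSetting` are in scope after `using LinRegOutliers`):

```julia
using LinRegOutliers

# regression setting for the documented model: time ~ distance + cases
setting = createRegressionSetting(@formula(time ~ distance + cases), softdrinkdelivery)

# any of the package's estimators can run on it, e.g. the newly added cm97
result = cm97(setting)
result["betas"], result["converged"]
```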

5 changes: 4 additions & 1 deletion src/ols.jl
@@ -8,7 +8,10 @@ ols(X::Array{Float64,2}, y::Array{Float64,1})::OLS = OLS(X, y, qr(X, Val(true))

function wls(X::Array{Float64,2}, y::Array{Float64,1}, wts::Array{Float64,1})
W = Diagonal(sqrt.(wts))
-return ols(W*X, W*y)
+#  I commented this because passing weighted values of X and y to OLS
+#  causes wrong calculations of residuals.
+# return ols(W * X, W * y)
+return OLS(X, y, qr(W * X, Val(true)) \ (W * y))
end

residuals(ols::OLS) = ols.y .- ols.X * ols.betas
10 changes: 10 additions & 0 deletions test/runtests.jl
@@ -682,3 +682,13 @@ end
result = bacon(reg, m=12)
@test result == [1, 3, 4, 21]
end

@testset "Chatterjee & Mächler (1997) cm97 with stackloss" begin
df2 = stackloss
reg = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss)
regresult = cm97(reg)
betas = regresult["betas"]
betas_in_article = [-37.00, 0.84, 0.63, -0.11]
@test isapprox(betas, betas_in_article, atol=0.01)
end
