From f4254b694666e7f1ada4da013c5f5331374ae6fe Mon Sep 17 00:00:00 2001 From: jbytecode Date: Wed, 28 Oct 2020 21:35:15 +0300 Subject: [PATCH] =?UTF-8?q?implement=20Chatterjee=20and=20M=C3=A4chler=20(?= =?UTF-8?q?1997)=20algorithm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Manifest.toml | 28 +++++++-------- Project.toml | 2 +- README.md | 2 +- src/.dev/outlier.jl | 5 ++- src/.dev/test.jl | 10 ++++++ src/LinRegOutliers.jl | 6 +++- src/cm97.jl | 81 +++++++++++++++++++++++++++++++++++++++++++ src/data.jl | 26 +++++++++++++- src/ols.jl | 5 ++- test/runtests.jl | 10 ++++++ 10 files changed, 153 insertions(+), 22 deletions(-) create mode 100644 src/cm97.jl diff --git a/Manifest.toml b/Manifest.toml index 278fb8a..c1c038b 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -8,9 +8,9 @@ version = "2.3.0" [[ArrayInterface]] deps = ["LinearAlgebra", "Requires", "SparseArrays"] -git-tree-sha1 = "920136b6b8ae5bd28a3c45d68e55421dec156d7f" +git-tree-sha1 = "bd09109dffaa3926a20178cb8432edd729be0db0" uuid = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" -version = "2.13.6" +version = "2.13.7" [[Artifacts]] deps = ["Pkg"] @@ -70,9 +70,9 @@ version = "0.3.0" [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "b4f242d5e9f65648d438a03d245a9fdc6e38d728" +git-tree-sha1 = "a706ff10f1cd8dab94f59fd09c0e657db8e77ff0" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.22.0" +version = "3.23.0" [[CompilerSupportLibraries_jll]] deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] @@ -99,9 +99,9 @@ version = "0.21.8" [[DataStructures]] deps = ["Compat", "InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "db07bb22795762895b60e44d62b34b16c982a687" +git-tree-sha1 = "fb0aa371da91c1ff9dc7fbed6122d3e411420b9c" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.18.7" +version = "0.18.8" [[DataValueInterfaces]] git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" @@ -264,9 +264,9 @@ uuid = "82899510-4779-5014-852e-03e436cf321d" version = "1.0.0" [[JLLWrappers]] -git-tree-sha1 = "7cec881362e5b4e367ff0279dd99a06526d51a55" +git-tree-sha1 = "c70593677bbf2c3ccab4f7500d0f4dacfff7b75c" uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.1.2" +version = "1.1.3" [[JSON]] deps = ["Dates", "Mmap", "Parsers", "Unicode"] @@ -329,9 +329,9 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[MbedTLS]] deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] -git-tree-sha1 = "426a6978b03a97ceb7ead77775a1da066343ec6e" +git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" uuid = "739be429-bea8-5141-9913-cc70e7f3736d" -version = "1.0.2" +version = "1.0.3" [[MbedTLS_jll]] deps = ["Libdl", "Pkg"] @@ -441,9 +441,9 @@ version = "1.0.7" [[Plots]] deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryBasics", "GeometryTypes", "JSON", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"] -git-tree-sha1 = "61ece2ce87bbacbea01d3752aef1af89409d6a55" +git-tree-sha1 = "4d94e6b49fbb0c479e71e12aa1446c4053f5a25e" uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -version = "1.7.1" +version = "1.7.3" [[PooledArrays]] deps = ["DataAPI"] @@ -558,9 +558,9 @@ version = "0.10.3" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "016d1e1a00fabc556473b07161da3d39726ded35" +git-tree-sha1 = "da4cf579416c81994afd6322365d00916c79b8ae" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "0.12.4" +version = "0.12.5" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] diff --git a/Project.toml b/Project.toml index 108586d..f34af24 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "LinRegOutliers" uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a" authors = ["Mehmet Hakan Satman ", "Shreesh Adiga <16567adigashreesh@gmail.com>"] -version = "0.7.0" +version = "0.8.0" [deps] Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" diff --git a/README.md b/README.md index 70e289e..6c786b1 100644 --- a/README.md +++ b/README.md @@ -30,11 +30,11 @@ A Julia package for outlier detection in linear regression. - Atkinson (1994) Forward Search Algorithm - BACON Algorithm (Billor & Hadi & Velleman (2000)) - Hadi (1994) Algorithm +- Chatterjee & Mächler (1997) - Summary ## Unimplemented Methods -- Chatterjee & Mächler (1997) - Depth based estimators (Regression depth, deepest regression, etc.) - Theil & Sen estimator for mutliple regression diff --git a/src/.dev/outlier.jl b/src/.dev/outlier.jl index b2d07b9..7fd5e0e 100644 --- a/src/.dev/outlier.jl +++ b/src/.dev/outlier.jl @@ -65,9 +65,8 @@ include("../atkinsonstalactiteplot.jl") include("../imon2005.jl") include("../ccf.jl") include("../bacon.jl") +include("../cm97.jl") -println("ready") -using BenchmarkTools +println("ready") -reg = createRegressionSetting(@formula(y ~ x1 + x2 + x3), hbk) diff --git a/src/.dev/test.jl b/src/.dev/test.jl index 463e4e0..84b6c1d 100644 --- a/src/.dev/test.jl +++ b/src/.dev/test.jl @@ -667,3 +667,13 @@ end result = bacon(reg, m=12) @test result == [1, 3, 4, 21] end + + +@testset "Chatterjee & Mächler (1997) cm97 with stackloss" begin + df2 = stackloss + reg = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss) + regresult = cm97(reg) + betas = regresult["betas"] + betas_in_article = [-37.00, 0.84, 0.63, -0.11] + @test isapprox(betas, betas_in_article, atol=0.01) +end \ No newline at end of file diff --git a/src/LinRegOutliers.jl b/src/LinRegOutliers.jl index 776a133..5b9f457 100644 --- a/src/LinRegOutliers.jl +++ b/src/LinRegOutliers.jl @@ -114,6 +114,9 @@ include("bacon.jl") # Imon 2005 Algorithm include("imon2005.jl") +#  Chatterjee & Mächler (1997) Algorithm +include("cm97.jl") + # All-in-one include("summary.jl") @@ -132,7 +135,7 @@ export find_minimum_nonzero # Data export phones, hbk, stackloss export weightloss, hs93randomdata, woodgravity -export hills +export hills, softdrinkdelivery # Diagnostics export dffit, dfbeta @@ -172,5 +175,6 @@ export ccf export imon2005 export atkinson94, atkinsonstalactiteplot, generate_stalactite_plot export bacon +export cm97 end # module diff --git a/src/cm97.jl b/src/cm97.jl new file mode 100644 index 0000000..48e5e3d --- /dev/null +++ b/src/cm97.jl @@ -0,0 +1,81 @@ +""" + + cm97(setting; maxiter = 1000) + +Perform the Chatterjee and Mächler (1997) algorithm for the given regression setting. + +# Arguments +- `setting::RegressionSetting`: RegressionSetting object with a formula and dataset. + +# Examples +```julia-repl +julia> myreg = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss) +julia> result = cm97(myreg) +Dict{String,Any} with 3 entries: + "betas" => [-37.0007, 0.839285, 0.632333, -0.113208] + "iterations" => 22 + "converged" => true +``` + +# References +Chatterjee, Samprit, and Martin Mächler. "Robust regression: A weighted least squares approach." +Communications in Statistics-Theory and Methods 26.6 (1997): 1381-1394. +""" +function cm97(setting::RegressionSetting; maxiter::Int=1000) + + X = designMatrix(setting) + y = responseVector(setting) + return cm97(X, y) + +end + + + +function cm97(X::Array{Float64,2}, y::Array{Float64,1}; maxiter::Int=1000) + + n, p = size(X) + pbar::Float64 = p / n + hat = hatmatrix(X) + + w_is = Array{Float64}(undef, n) + betas = Array{Float64}(undef, p) + + converged::Bool = false + iter::Int = 0 + + # initial weights + @inbounds for i in 1:n + w_is[i] = 1.0 / max(hat[i, i], pbar) + end + + while iter <= maxiter + oldbetas = betas + + wols = wls(X, y, w_is) + betas = coef(wols) + r = y - X * betas + medi = median(abs.(r)) + + @inbounds for i in 1:n + w_is[i] = (1.0 - hat[i, i])^2.0 / max(abs(r[i]), medi) + end + + if isapprox(oldbetas, betas) + converged = true + break + end + + iter += 1 + + end + + result = Dict{String,Any}( + "converged" => converged, + "betas" => betas, + "iterations" => iter + ) + + return result +end + + diff --git a/src/data.jl b/src/data.jl index 99d81e6..80ce3a5 100644 --- a/src/data.jl +++ b/src/data.jl @@ -177,4 +177,28 @@ const hills = DataFrame( dist=[2.4, 6.0, 6.0, 7.5, 8.0, 8.0, 16.0, 6.0, 5.0, 6.0, 28.0, 5.0, 9.5, 6.0, 4.5, 10.0, 14.0, 3.0, 4.5, 5.5, 3.0, 3.5, 6.0, 2.0, 3.0, 4.0, 6.0, 5.0, 6.5, 5.0, 10.0, 6.0, 18.0, 4.5, 20.0], climb=[650.0, 2500.0, 900.0, 800.0, 3070.0, 2866.0, 7500.0, 800.0, 800.0, 650.0, 2100.0, 2000.0, 2200.0, 500.0, 1500.0, 3000.0, 2200.0, 350.0, 1000.0, 600.0, 300.0, 1500.0, 2200.0, 900.0, 600.0, 2000.0, 800.0, 950.0, 1750.0, 500.0, 4400.0, 600.0, 5200.0, 850.0, 5000.0], time=[0.268055555555556, 0.805833333333333, 0.560833333333333, 0.76, 1.03777777777778, 1.22027777777778, 3.41027777777778, 0.606111111111111, 0.495833333333333, 0.6625, 3.21111111111111, 0.7175, 1.08333333333333, 0.735555555555556, 0.448888888888889, 1.20416666666667, 1.64027777777778, 1.31083333333333, 0.290277777777778, 0.542777777777778, 0.265833333333333, 0.465, 0.794166666666667, 0.298888888888889, 0.311388888888889, 0.436944444444444, 0.573888888888889, 0.476111111111111, 0.841666666666667, 0.349166666666667, 1.42638888888889, 0.539722222222222, 2.8375, 0.468333333333333, 2.66388888888889] -) \ No newline at end of file +) + + + + +""" + Soft Drink Delivery Data + +# Components +- `cases::Array{Float64, 1}`: Independent variable. +- `distance::Array{Float64, 1}`: Independent variable. +- `time::Array{Float64, 1}`: Dependent variable. + +# Model +time ~ distance + cases + +# Reference +D. C. Montgomery and E. A. Peck (1992) Introduction to Regression Analysis. Wiley, New York. +""" +const softdrinkdelivery = DataFrame( + time=[16.68, 11.5, 12.03, 14.88, 13.75, 18.11, 8.0, 17.83, 79.24, 21.5, 40.33, 21.0, 13.5, 19.75, 24.0, 29.0, 15.35, 19.0, 9.5, 35.1, 17.9, 52.32, 18.75, 19.83, 10.75], + distance=[560.0, 220.0, 340.0, 80.0, 150.0, 330.0, 110.0, 210.0, 1460.0, 605.0, 688.0, 215.0, 255.0, 462.0, 448.0, 776.0, 200.0, 132.0, 36.0, 770.0, 140.0, 810.0, 450.0, 635.0, 150.0], + cases=[7.0, 3.0, 3.0, 4.0, 6.0, 7.0, 2.0, 7.0, 30.0, 5.0, 16.0, 10.0, 4.0, 6.0, 9.0, 10.0, 6.0, 7.0, 3.0, 17.0, 10.0, 26.0, 9.0, 8.0, 4.0] +) + diff --git a/src/ols.jl b/src/ols.jl index 6ead68c..f5b8e89 100644 --- a/src/ols.jl +++ b/src/ols.jl @@ -8,7 +8,10 @@ ols(X::Array{Float64,2}, y::Array{Float64,1})::OLS = OLS(X, y, qr(X, Val(true)) function wls(X::Array{Float64,2}, y::Array{Float64,1}, wts::Array{Float64,1}) W = Diagonal(sqrt.(wts)) - return ols(W*X, W*y) + #  I commented this because passing weighted values of X and y to OLS + #  causes wrong calculations of residuals. + # return ols(W * X, W * y) + return OLS(X, y, qr(W * X, Val(true)) \ (W * y)) end residuals(ols::OLS) = ols.y .- ols.X * ols.betas diff --git a/test/runtests.jl b/test/runtests.jl index 5ad8fbd..0a69550 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -682,3 +682,13 @@ end result = bacon(reg, m=12) @test result == [1, 3, 4, 21] end + +@testset "Chatterjee & Mächler (1997) cm97 with stackloss" begin + df2 = stackloss + reg = createRegressionSetting(@formula(stackloss ~ airflow + watertemp + acidcond), stackloss) + regresult = cm97(reg) + betas = regresult["betas"] + betas_in_article = [-37.00, 0.84, 0.63, -0.11] + @test isapprox(betas, betas_in_article, atol=0.01) +end +