diff --git a/Project.toml b/Project.toml index eaa979e..ff8b554 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ -authors = ["Mehmet Hakan Satman "] name = "LinRegOutliers" uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a" +authors = ["Mehmet Hakan Satman "] version = "0.1.0" [deps] @@ -8,4 +8,15 @@ Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" + + +[compat] +julia = "^1.4" +Clustering = "^0.14.1" +DataFrames = "^0.21.6" +Distributions = "^0.23.9" +GLM = "^1.3.10" +StatsBase = "^0.33.0" + diff --git a/src/LinRegOutliers.jl b/src/LinRegOutliers.jl index 092fd3c..ae6c9a3 100644 --- a/src/LinRegOutliers.jl +++ b/src/LinRegOutliers.jl @@ -5,6 +5,7 @@ using DataFrames using Distributions using Clustering using StatsBase +using LinearAlgebra include("basis.jl") include("data.jl") @@ -14,6 +15,7 @@ include("ks89.jl") include("smr98.jl") include("lms.jl") include("lts.jl") +include("mve.jl") # Essentials from other packages export @formula, DataFrame @@ -41,5 +43,5 @@ export ks89 export smr98 export lms export lts - +export mve end # module diff --git a/src/mve.jl b/src/mve.jl new file mode 100644 index 0000000..8fe7115 --- /dev/null +++ b/src/mve.jl @@ -0,0 +1,80 @@ +function mahalabonisSquaredMatrix(data::DataFrame; meanvector=nothing, covmatrix=nothing)::Array{Float64,2} + datamat = convert(Matrix, data) + if meanvector === nothing + meanvector = colwise(mean, data) + end + if covmatrix === nothing + covmatrix = cov(datamat) + end + try + invm = inv(covmatrix) + MD2 = (datamat .- meanvector') * invm * (datamat .- meanvector')' + return MD2 + catch e + n = size(datamat)[1] + return zeros(Float64, (n, n)) + end +end + +function enlargesubset(initialsubset, data::DataFrame, dataMatrix::Matrix, h::Int) + n, p = size(dataMatrix) + indices = collect(1:n) + basicsubset = copy(initialsubset) + while length(basicsubset) < h + meanvector = colwise(mean, data[basicsubset,:]) + covmatrix = cov(dataMatrix[basicsubset, :]) + md2mat = mahalabonisSquaredMatrix(data, meanvector=meanvector, covmatrix=covmatrix) + md2 = diag(md2mat) + md2sortedindex = sortperm(md2) + basicsubset = md2sortedindex[1:(length(basicsubset) + 1)] + end + return basicsubset +end + +function mve(data::DataFrame; alpha=0.05) + dataMatrix = convert(Matrix, data) + n, p = size(dataMatrix) + chisquared = Chisq(p) + chisqcrit = quantile(chisquared, 1 - alpha) + c = sqrt(chisqcrit) + h = Int(floor((n + p + 1.0) / 2.0)) + indices = collect(1:n) + k = p + 1 + mingoal = Inf + bestinitialsubset = [] + besthsubset = [] + maxiter = minimum([p * 500, 3000]) + initialsubset = [] + hsubset = [] + for iter in 1:maxiter + goal = Inf + try + initialsubset = sample(indices, k, replace=false) + hsubset = enlargesubset(initialsubset, data, dataMatrix, h) + covmatrix = cov(dataMatrix[hsubset, :]) + meanvector = colwise(mean, data[hsubset, :]) + md2mat = mahalabonisSquaredMatrix(data, meanvector=meanvector, covmatrix=covmatrix) + DJ = sqrt(sort(diag(md2mat))[h]) + goal = (DJ / c)^p * det(covmatrix) + catch e + # Possibly singularity + end + if goal < mingoal + mingoal = goal + bestinitialsubset = initialsubset + besthsubset = hsubset + end + end + meanvector = colwise(mean, data[besthsubset, :]) + covariancematrix = cov(dataMatrix[besthsubset, :]) + md2 = diag(mahalabonisSquaredMatrix(data, meanvector=meanvector, covmatrix=covariancematrix)) + outlierset = filter(x -> md2[x] > chisqcrit, 1:n) + result = Dict() + result["goal"] = mingoal + result["best.subset"] = sort(besthsubset) + result["robust.location"] = meanvector + result["robust.covariance"] = covariancematrix + result["squared.mahalanobis"] = md2 + result["outliers"] = outlierset + return result +end \ No newline at end of file diff --git a/test/Project.toml b/test/Project.toml index d570966..1c52673 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -3,3 +3,5 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + + diff --git a/test/runtests.jl b/test/runtests.jl index 77af828..e1be293 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -150,3 +150,14 @@ end @test 21 in outset end +@testset "MVE - Algorithm - Phone data" begin + df = phones + outset = mve(df)["outliers"] + @test 15 in outset + @test 16 in outset + @test 17 in outset + @test 18 in outset + @test 19 in outset + @test 20 in outset + @test 21 in outset +end