Skip to content

Commit

Permalink
new version, compat section, and mve algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
jbytecode committed Aug 24, 2020
1 parent a009025 commit c00da74
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 2 deletions.
13 changes: 12 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,11 +1,22 @@
authors = ["Mehmet Hakan Satman <[email protected]>"]
name = "LinRegOutliers"
uuid = "6d4de0fb-32d9-4c65-aac1-cc9ed8b94b1a"
authors = ["Mehmet Hakan Satman <[email protected]>"]
version = "0.1.0"

[deps]
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"


[compat]
julia = "^1.4"
Clustering = "^0.14.1"
DataFrames = "^0.21.6"
Distributions = "^0.23.9"
GLM = "^1.3.10"
StatsBase = "^0.33.0"

4 changes: 3 additions & 1 deletion src/LinRegOutliers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ using DataFrames
using Distributions
using Clustering
using StatsBase
using LinearAlgebra

include("basis.jl")
include("data.jl")
Expand All @@ -14,6 +15,7 @@ include("ks89.jl")
include("smr98.jl")
include("lms.jl")
include("lts.jl")
include("mve.jl")

# Essentials from other packages
export @formula, DataFrame
Expand Down Expand Up @@ -41,5 +43,5 @@ export ks89
export smr98
export lms
export lts

export mve
end # module
80 changes: 80 additions & 0 deletions src/mve.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
function mahalabonisSquaredMatrix(data::DataFrame; meanvector=nothing, covmatrix=nothing)::Array{Float64,2}
datamat = convert(Matrix, data)
if meanvector === nothing
meanvector = colwise(mean, data)
end
if covmatrix === nothing
covmatrix = cov(datamat)
end
try
invm = inv(covmatrix)
MD2 = (datamat .- meanvector') * invm * (datamat .- meanvector')'
return MD2
catch e
n = size(datamat)[1]
return zeros(Float64, (n, n))
end
end

function enlargesubset(initialsubset, data::DataFrame, dataMatrix::Matrix, h::Int)
n, p = size(dataMatrix)
indices = collect(1:n)
basicsubset = copy(initialsubset)
while length(basicsubset) < h
meanvector = colwise(mean, data[basicsubset,:])
covmatrix = cov(dataMatrix[basicsubset, :])
md2mat = mahalabonisSquaredMatrix(data, meanvector=meanvector, covmatrix=covmatrix)
md2 = diag(md2mat)
md2sortedindex = sortperm(md2)
basicsubset = md2sortedindex[1:(length(basicsubset) + 1)]
end
return basicsubset
end

function mve(data::DataFrame; alpha=0.05)
dataMatrix = convert(Matrix, data)
n, p = size(dataMatrix)
chisquared = Chisq(p)
chisqcrit = quantile(chisquared, 1 - alpha)
c = sqrt(chisqcrit)
h = Int(floor((n + p + 1.0) / 2.0))
indices = collect(1:n)
k = p + 1
mingoal = Inf
bestinitialsubset = []
besthsubset = []
maxiter = minimum([p * 500, 3000])
initialsubset = []
hsubset = []
for iter in 1:maxiter
goal = Inf
try
initialsubset = sample(indices, k, replace=false)
hsubset = enlargesubset(initialsubset, data, dataMatrix, h)
covmatrix = cov(dataMatrix[hsubset, :])
meanvector = colwise(mean, data[hsubset, :])
md2mat = mahalabonisSquaredMatrix(data, meanvector=meanvector, covmatrix=covmatrix)
DJ = sqrt(sort(diag(md2mat))[h])
goal = (DJ / c)^p * det(covmatrix)
catch e
# Possibly singularity
end
if goal < mingoal
mingoal = goal
bestinitialsubset = initialsubset
besthsubset = hsubset
end
end
meanvector = colwise(mean, data[besthsubset, :])
covariancematrix = cov(dataMatrix[besthsubset, :])
md2 = diag(mahalabonisSquaredMatrix(data, meanvector=meanvector, covmatrix=covariancematrix))
outlierset = filter(x -> md2[x] > chisqcrit, 1:n)
result = Dict()
result["goal"] = mingoal
result["best.subset"] = sort(besthsubset)
result["robust.location"] = meanvector
result["robust.covariance"] = covariancematrix
result["squared.mahalanobis"] = md2
result["outliers"] = outlierset
return result
end
2 changes: 2 additions & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
GLM = "38e38edf-8417-5370-95a0-9cbb8c7f171a"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"


11 changes: 11 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,14 @@ end
@test 21 in outset
end

@testset "MVE - Algorithm - Phone data" begin
df = phones
outset = mve(df)["outliers"]
@test 15 in outset
@test 16 in outset
@test 17 in outset
@test 18 in outset
@test 19 in outset
@test 20 in outset
@test 21 in outset
end

0 comments on commit c00da74

Please sign in to comment.