Skip to content

Commit

Permalink
mahalanobisSquaredMatrix return Union{Nothing, Matrix} depending on t…
Browse files Browse the repository at this point in the history
…he determinant of the covariance matrix
  • Loading branch information
jbytecode committed Jul 23, 2023
1 parent ad08671 commit 6ccc31b
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 75 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# v0.10.2 (Upcoming Release)

- mahalanobisSquaredBetweenPairs() return Union{Nothing, Matrix} depending on the determinant of the covariance matrix

- mahalanobisSquaredMatrix return Union{Nothing, Matrix} depending on the determinant of the covariance matrix


# v0.10.1
Expand Down
38 changes: 27 additions & 11 deletions src/bacon.jl
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ function initial_basic_subset_multivariate_data(
)
n, _ = size(X)
if method == "mahalanobis"
distances = sqrt.(diag(mahalanobisSquaredMatrix(X)))
msm = mahalanobisSquaredMatrix(X)
@assert !isnothing(msm)
distances = sqrt.(diag(msm))
elseif method == "median"
median_vector = applyColumns(median, X)
distances = [norm(X[i, :] - median_vector, 2) for i = 1:n]
Expand Down Expand Up @@ -108,16 +110,12 @@ function bacon_multivariate_outlier_detection(
while (r_prev != r)
mean_basic_subset = mean(X[subset], dims = 1)
cov_basic_subset = X[subset]'X[subset]
distances =
sqrt.(
diag(
mahalanobisSquaredMatrix(
X,
meanvector = mean_basic_subset,
covmatrix = cov_basic_subset,
),
)
)

msm = mahalanobisSquaredMatrix(X, meanvector = mean_basic_subset, covmatrix = cov_basic_subset)

@assert !isnothing(msm)

distances = sqrt.(diag(msm))
c_hr = (h - r) / (h + r)
c_hr = c_hr < 0 ? 0 : c_hr
c_npr = c_hr + c_np
Expand Down Expand Up @@ -149,13 +147,21 @@ This function computes the t distance for each point and returns the distance ve
- `subset`: The vector which denotes the points inside the subset, used to scale the residuals accordingly.
"""
function compute_t_distance(X::Array{Float64,2}, y::Array{Float64}, subset::Array{Int64})

n, p = size(X)

t = zeros(Float64, n)

least_squares_fit = ols(X[subset, :], y[subset])

betas = coef(least_squares_fit)

err = residuals(least_squares_fit)

sigma = sqrt((err'err) / (n - p))

covmatrix_inv = inv(X[subset, :]'X[subset, :])

for i = 1:n
scale_factor = (X[i, :]') * (covmatrix_inv * X[i, :])
residual = (y[i] - X[i, :]' * betas)
Expand All @@ -165,6 +171,7 @@ function compute_t_distance(X::Array{Float64,2}, y::Array{Float64}, subset::Arra
t[i] = residual / (sigma * sqrt(1 + scale_factor))
end
end

return abs.(t)
end

Expand Down Expand Up @@ -195,6 +202,7 @@ function bacon_regression_initial_subset(
method = method,
alpha = alpha,
)["distances"]

initial_subset = select_subset(X, m, distances)

t = compute_t_distance(X, y, initial_subset)
Expand Down Expand Up @@ -257,12 +265,17 @@ function bacon(
method::String = "mahalanobis",
alpha = 0.025,
)::Dict

n, p = size(X)

subset = bacon_regression_initial_subset(X, y, m, method = method, alpha = alpha)

r_prev = 0

r = length(subset)

iter = 0

while (r != r_prev)
t = compute_t_distance(X, y, subset)
tdist = TDist(r - p)
Expand All @@ -278,10 +291,13 @@ function bacon(

outlierindices = setdiff(1:n, subset)
inlierindices = subset

cleanols = ols(X[inlierindices, :], y[inlierindices])

cleanbetas = coef(cleanols)

result = Dict("outliers" => outlierindices, "betas" => cleanbetas)

return result
end

Expand Down
39 changes: 18 additions & 21 deletions src/bch.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ function bch(
maxiter = 1000,
epsilon = 0.000001,
)::Dict

n, p = size(Xdesign)
h = Int(floor((n + p + 1.0) / 2))
X = Xdesign
Expand All @@ -102,26 +103,23 @@ function bch(
# Algorithm 2 - Step 0.a
coordmeds = coordinatwisemedians(X)
A = ((X .- coordmeds')' * (X .- coordmeds')) / (n - 1.0)
dsquared = diag(
mahalanobisSquaredMatrix(
X,
meanvector = coordmeds,
covmatrix = A,
),
)

msm = mahalanobisSquaredMatrix(X, meanvector = coordmeds, covmatrix = A)
@assert !isnothing(msm)

dsquared = diag(msm)
d = sqrt.(dsquared)

# Algorithm 2 - Step 0.b
bestindicesofd = sortperm(d)[1:h]
colmeansofh = map(i -> mean(X[bestindicesofd, i]), 1:p)
covmatofh = cov(X[bestindicesofd, :])
newdsquared = diag(
mahalanobisSquaredMatrix(
X,
meanvector = colmeansofh,
covmatrix = covmatofh,
),
)

msm2 = mahalanobisSquaredMatrix(X, meanvector = colmeansofh, covmatrix = covmatofh)
@assert !isnothing(msm2)

newdsquared = diag(msm2)

newd = sqrt.(newdsquared)

# Algorithm 2 - Steps 1, 2, 3
Expand Down Expand Up @@ -150,13 +148,10 @@ function bch(
r = length(basicsubsetindices)
colmeanofbasicsubset = map(i -> mean(X[basicsubsetindices, i]), 1:p)
covmatofbasicsubset = cov(X[basicsubsetindices, :])
newdsquared = diag(
mahalanobisSquaredMatrix(
X,
meanvector = colmeanofbasicsubset,
covmatrix = covmatofbasicsubset,
),
)

msm4 = mahalanobisSquaredMatrix(X, meanvector = colmeanofbasicsubset, covmatrix = covmatofbasicsubset)
@assert !isnothing(msm4)
newdsquared = diag(msm4)
newd = sqrt.(newdsquared)
sortednewdsquared = sort(newdsquared)
if sortednewdsquared[r + 1] >= crit
Expand All @@ -180,6 +175,7 @@ function bch(
betas = Float64[]
squared_normalized_resids = Float64[]
resids = Float64[]

for i = 1:maxiter
wols = wls(Xdesign, y, weights)
betas = coef(wols)
Expand All @@ -191,6 +187,7 @@ function bch(
weights = (a .^ 2.0) / (sum(a .^ 2.0))
estimatedvariance = n * c * sum(resids .^ 2) / (n - p + 1)
push!(estimatedvariances, estimatedvariance)

if length(estimatedvariances) > 2
if abs(estimatedvariance - estimatedvariances[end-1]) < epsilon ||
abs(estimatedvariance - estimatedvariances[end-2]) < epsilon
Expand Down
4 changes: 2 additions & 2 deletions src/dataimage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@ module DataImage
export dataimage

import ..Diagnostics:
mahalanobisSquaredMatrix, euclideanDistances, mahalanobisSquaredBetweenPairs
mahalanobisSquaredMatrix, euclideanDistances, SquaredBetweenPairs

import ..RGBX

"""
dataimage(dataMatrix; distance = :mahalanobis)
dataimage(dataMatrix; distance = :)
Generate the Marchette & Solka (2003) data image for a given data matrix.
Expand Down
31 changes: 17 additions & 14 deletions src/diagnostics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ function mahalanobisSquaredBetweenPairs(pairs::Matrix; covmatrix = nothing)::Uni
end

invm = inv(covmatrix)

for i 1:n
for j i:n
newmat[i, j] =
Expand Down Expand Up @@ -101,26 +101,29 @@ Calculate Mahalanobis distances between pairs.
Marchette, David J., and Jeffrey L. Solka. "Using data images for outlier detection."
Computational Statistics & Data Analysis 43.4 (2003): 541-552.
"""
function mahalanobisBetweenPairs(dataMatrix::Array{Float64, 2})::Array{Float64, 2}
n, _ = size(dataMatrix)
d = zeros(Float64, n, n)
covmat = cov(dataMatrix)
if det(covmat) == 0.0
@warn "Covariance matrix is singular, mahalanobis distances can not be calculated."
function mahalanobisBetweenPairs(dataMatrix::Array{Float64, 2})::Union{Nothing, Matrix}

n, _ = size(dataMatrix)

d = zeros(Float64, n, n)

covmat = cov(dataMatrix)

if iszero(det(covmat))
return nothing
end
covinv = inv(covmat)
for i 1:n

covinv = inv(covmat)

for i 1:n
for j i:n
if i != j
d[i, j] = sqrt(
(dataMatrix[i, :] .- dataMatrix[j, :]) *
covinv *
(dataMatrix[i, :] .- dataMatrix[j, :])',
)
d[i, j] = sqrt((dataMatrix[i, :] .- dataMatrix[j, :]) * covinv * (dataMatrix[i, :] .- dataMatrix[j, :])')
d[j, i] = d[i, j]
end
end
end

return d
end

Expand Down
37 changes: 20 additions & 17 deletions src/hadi1992.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,15 +85,23 @@ function hadi1992(multivariateData::Array{Float64,2}; alpha = 0.05)
# Step 0
meds = coordinatwisemedians(multivariateData)
Sm = (1.0 / (n - 1.0)) * (multivariateData .- meds')' * (multivariateData .- meds')
mah0 =
diag(mahalanobisSquaredMatrix(multivariateData, meanvector = meds, covmatrix = Sm))

msm1 = mahalanobisSquaredMatrix(multivariateData, meanvector = meds, covmatrix = Sm)
@assert !isnothing(msm1)

mah0 = diag(msm1)

ordering_indices_mah0 = sortperm(mah0)
best_indices_mah0 = ordering_indices_mah0[1:h]
starting_data = multivariateData[best_indices_mah0, :]

Cv = coordinatwisemedians(starting_data)
Sv = (1.0 / (h - 1.0)) * (starting_data .- Cv')' * (starting_data .- Cv')
mah1 = diag(mahalanobisSquaredMatrix(multivariateData, meanvector = Cv, covmatrix = Sv))

msm2 = mahalanobisSquaredMatrix(multivariateData, meanvector = Cv, covmatrix = Sv)
@assert !isnothing(msm2)

mah1 = diag(msm2)
ordering_indices_mah1 = sortperm(mah1)

r = p + 1
Expand All @@ -113,23 +121,18 @@ function hadi1992(multivariateData::Array{Float64,2}; alpha = 0.05)
if det(cfactor * Sb) == 0
@info "singular Sb case"
newSb = hadi1992_handle_singularity(cfactor * Sb)
mah1 = diag(
mahalanobisSquaredMatrix(
multivariateData,
meanvector = Cb,
covmatrix = newSb,
),
)

msm3 = mahalanobisSquaredMatrix(multivariateData, meanvector = Cb, covmatrix = newSb,)
@assert !isnothing(msm3)

mah1 = diag(msm3)

ordering_indices_mah1 = sortperm(mah1)
basic_subset_indices = ordering_indices_mah1[1:r]
else
mah1 = diag(
mahalanobisSquaredMatrix(
multivariateData,
meanvector = Cb,
covmatrix = (cfactor * Sb),
),
)
msm4 = mahalanobisSquaredMatrix(multivariateData, meanvector = Cb, covmatrix = (cfactor * Sb))
@assert !isnothing(msm4)
mah1 = diag(msm4)
ordering_indices_mah1 = sortperm(mah1)
basic_subset_indices = ordering_indices_mah1[1:r]
end
Expand Down
25 changes: 16 additions & 9 deletions src/hadi1994.jl
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,21 @@ function hadi1994(multivariateData::Array{Float64,2}; alpha = 0.05)

meds = coordinatwisemedians(multivariateData)
Sm = (1.0 / (n - 1.0)) * (multivariateData .- meds')' * (multivariateData .- meds')
mah0 =
diag(mahalanobisSquaredMatrix(multivariateData, meanvector = meds, covmatrix = Sm))

msm1 = mahalanobisSquaredMatrix(multivariateData, meanvector = meds, covmatrix = Sm)
@assert !isnothing(msm1)

mah0 = diag(msm1)

ordering_indices_mah0 = sortperm(mah0)
best_indices_mah0 = ordering_indices_mah0[1:h]
starting_data = multivariateData[best_indices_mah0, :]

Cv = coordinatwisemedians(starting_data)
Sv = (1.0 / (h - 1.0)) * (starting_data .- Cv')' * (starting_data .- Cv')
mah1 = diag(mahalanobisSquaredMatrix(multivariateData, meanvector = Cv, covmatrix = Sv))
msm2 = mahalanobisSquaredMatrix(multivariateData, meanvector = Cv, covmatrix = Sv)
@assert !isnothing(msm2)
mah1 = diag(msm2)
ordering_indices_mah1 = sortperm(mah1)

r = p + 1
Expand Down Expand Up @@ -97,13 +103,14 @@ function hadi1994(multivariateData::Array{Float64,2}; alpha = 0.05)
end
end

mah1 = diag(
mahalanobisSquaredMatrix(
multivariateData,
meanvector = Cb,
covmatrix = (cfactor * Sb),
),
msm3 = mahalanobisSquaredMatrix(
multivariateData,
meanvector = Cb,
covmatrix = (cfactor * Sb),
)
@assert !isnothing(msm3)

mah1 = diag(msm3)

ordering_indices_mah1 = sortperm(mah1)
basic_subset_indices = ordering_indices_mah1[1:r]
Expand Down

0 comments on commit 6ccc31b

Please sign in to comment.