From 7d17172161efe6ab5139ee24498066aefe187079 Mon Sep 17 00:00:00 2001
From: "mhsatman@gmail.com"
Date: Sun, 23 Jul 2023 18:01:32 +0300
Subject: [PATCH] replace Array{Float64, x} with Vector and Matrix

---
 CHANGELOG.md              |  6 +++-
 src/basis.jl              |  2 +-
 src/bch.jl                |  2 +-
 src/ccf.jl                |  4 +--
 src/data.jl               | 12 ++++----
 src/dataimage.jl          |  4 +--
 src/diagnostics.jl        | 62 +++++++++++++++++++--------------------
 src/hadi1992.jl           |  4 +--
 src/hadi1994.jl           |  2 +-
 src/lad.jl                |  4 +--
 src/lta.jl                |  4 +--
 src/lts.jl                |  5 ----
 src/ols.jl                | 16 +++++-----
 src/py95.jl               |  2 +-
 src/quantileregression.jl |  4 +--
 src/smr98.jl              |  4 +--
 src/theilsen.jl           |  6 ++--
 17 files changed, 71 insertions(+), 72 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 835bff9..f3c2443 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,11 @@
 # v0.10.2 (Upcoming Release)
 
 - mahalanobisSquaredBetweenPairs() return Union{Nothing, Matrix} depending on the determinant of the covariance matrix
-- mahalanobisSquaredMatrix return Union{Nothing, Matrix} depending on the determinant of the covariance matrix
+- mahalanobisSquaredMatrix() returns Union{Nothing, Matrix} depending on the determinant of the covariance matrix
+- import in DataImages fixed.
+- Array{Float64, 1} is replaced by Vector{Float64}.
+- Array{Float64, 2} is replaced by Matrix{Float64}.
+- Use of try/catch reduced; many try blocks were guarding against singularities.
 
 # v0.10.1
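The whole patch rides on the fact that `Vector{T}` and `Matrix{T}` are built-in type aliases for `Array{T, 1}` and `Array{T, 2}`, so the renames below are purely cosmetic and cannot change dispatch or behavior. A quick REPL check:

```julia-repl
julia> Vector{Float64} === Array{Float64, 1}
true

julia> Matrix{Float64} === Array{Float64, 2}
true

julia> Vector{Float64}(undef, 3) isa Array{Float64, 1}
true
```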
diff --git a/src/basis.jl b/src/basis.jl
index 6cf5540..bb34f09 100644
--- a/src/basis.jl
+++ b/src/basis.jl
@@ -305,7 +305,7 @@ end
 Return minimum of numbers greater than zero.
 
 # Arguments
-- `arr::Array{Float64, 1}`: A function that takes a one dimensional array as argument.
+- `arr::Vector{Float64}`: A one-dimensional array of numbers.
 
 # Example
 ```julia-repl
diff --git a/src/bch.jl b/src/bch.jl
index 8b0c1b2..d3ef472 100644
--- a/src/bch.jl
+++ b/src/bch.jl
@@ -163,7 +163,7 @@ function bch(
     # Algorithm 3 - Fitting
     squared_normalized_robust_distances = (newd .^ 2.0) / sum(newd .^ 2.0)
     md = median(newd)
-    newdmd = Array{Float64, 1}(undef, n)
+    newdmd = Vector{Float64}(undef, n)
     for i in 1:n
         newdmd[i] = newd[i] / maximum([newd[i], md])
     end
diff --git a/src/ccf.jl b/src/ccf.jl
index c01583e..8b39fbc 100644
--- a/src/ccf.jl
+++ b/src/ccf.jl
@@ -79,8 +79,8 @@ end
 Perform signed gradient descent for clipped convex functions for a given regression setting.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix of the linear model.
-- `y::Array{Float64, 1}`: Response vector of the linear model.
+- `X::Matrix{Float64}`: Design matrix of the linear model.
+- `y::Vector{Float64}`: Response vector of the linear model.
 - `starting_lambdas::Array{Float64,1}`: Starting values of weighting parameters used by signed gradient descent.
 - `alpha::Float64`: Loss at which a point is labeled as an outlier. If unspecified, will be chosen as p*mean(residuals.^2), where residuals are OLS residuals.
 - `p::Float64`: Points that have squared OLS residual greater than p times the mean squared OLS residual are considered outliers.
diff --git a/src/data.jl b/src/data.jl
index 9c8600a..ffce699 100644
--- a/src/data.jl
+++ b/src/data.jl
@@ -980,9 +980,9 @@ const woodgravity = DataFrame(
 Scottish Hill Races Data
 
 # Components
-- `dist::Array{Float64, 1}`: Distance in miles (Independent).
-- `climb::Array{Float64, 1}`: Heights in feet (Independent).
-- `time::Array{Float64, 1}`: Record times in hours (Dependent).
+- `dist::Vector{Float64}`: Distance in miles (Independent).
+- `climb::Vector{Float64}`: Heights in feet (Independent).
+- `time::Vector{Float64}`: Record times in hours (Dependent).
 
 # Model
 time ~ dist + climb
@@ -1111,9 +1111,9 @@ const hills = DataFrame(
 Soft Drink Delivery Data
 
 # Components
-- `cases::Array{Float64, 1}`: Independent variable.
-- `distance::Array{Float64, 1}`: Independent variable.
-- `time::Array{Float64, 1}`: Dependent variable.
+- `cases::Vector{Float64}`: Independent variable.
+- `distance::Vector{Float64}`: Independent variable.
+- `time::Vector{Float64}`: Dependent variable.
 
 # Model
 time ~ distance + cases
diff --git a/src/dataimage.jl b/src/dataimage.jl
index 2619f02..960f499 100644
--- a/src/dataimage.jl
+++ b/src/dataimage.jl
@@ -3,7 +3,7 @@ module DataImage
 export dataimage
 
 import ..Diagnostics:
-    mahalanobisSquaredMatrix, euclideanDistances, SquaredBetweenPairs
+    mahalanobisSquaredMatrix, euclideanDistances, mahalanobisSquaredBetweenPairs
 
 import ..RGBX
 
@@ -14,7 +14,7 @@ import ..RGBX
 Generate the Marchette & Solka (2003) data image for a given data matrix.
 
 # Arguments
-- `dataMatrix::Array{Float64, 1}`: Data matrix with dimensions n x p, where n is the number of observations and p is the number of variables.
+- `dataMatrix::Matrix{Float64}`: Data matrix with dimensions n x p, where n is the number of observations and p is the number of variables.
 - `distance::Symbol`: Optional argument for the distance function.
 
 # Notes
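The data image described above is built from a matrix of pairwise distances, computed by the Euclidean or Mahalanobis helpers named in the corrected import. A self-contained sketch of the Euclidean case; `pairwise_euclidean` is an illustrative stand-in, not the package function:

```julia
using LinearAlgebra: norm

# Stand-in illustrating what euclideanDistances computes: the
# symmetric n×n matrix of Euclidean distances between rows of X.
function pairwise_euclidean(X::Matrix{Float64})::Matrix{Float64}
    n, _ = size(X)
    D = zeros(Float64, n, n)
    for i in 1:n, j in (i+1):n
        D[i, j] = norm(X[i, :] - X[j, :])
        D[j, i] = D[i, j]          # distances are symmetric
    end
    return D
end

pairwise_euclidean([0.0 0.0; 3.0 4.0])  # off-diagonal entries are 5.0
```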
diff --git a/src/diagnostics.jl b/src/diagnostics.jl
index 753057c..6212f8a 100644
--- a/src/diagnostics.jl
+++ b/src/diagnostics.jl
@@ -33,7 +33,7 @@ import DataFrames: DataFrame
 Calculate Euclidean distances between pairs.
 
 # Arguments
-- `dataMatrix::Array{Float64, 1}`: Data matrix with dimensions n x p, where n is the number of observations and p is the number of variables.
+- `dataMatrix::Matrix{Float64}`: Data matrix with dimensions n x p, where n is the number of observations and p is the number of variables.
 
 # Notes
 This is the helper function for the dataimage() function defined in Marchette & Solka (2003).
@@ -42,7 +42,7 @@ Calculate Euclidean distances between pairs.
 Marchette, David J., and Jeffrey L. Solka. "Using data images for outlier
 detection." Computational Statistics & Data Analysis 43.4 (2003): 541-552.
 """
-function euclideanDistances(dataMatrix::Array{Float64, 2})::Array{Float64, 2}
+function euclideanDistances(dataMatrix::Matrix{Float64})::Matrix{Float64}
     n, _ = size(dataMatrix)
     d = zeros(Float64, n, n)
     for i ∈ 1:n
@@ -90,7 +90,7 @@ end
 Calculate Mahalanobis distances between pairs.
 
 # Arguments
-- `dataMatrix::Array{Float64, 1}`: Data matrix with dimensions n x p, where n is the number of observations and p is the number of variables.
+- `dataMatrix::Matrix{Float64}`: Data matrix with dimensions n x p, where n is the number of observations and p is the number of variables.
 
 # Notes
 Differently from Mahalabonis distances, this function calculates Mahalanobis distances between
@@ -101,7 +101,7 @@ Calculate Mahalanobis distances between pairs.
 Marchette, David J., and Jeffrey L. Solka. "Using data images for outlier
 detection." Computational Statistics & Data Analysis 43.4 (2003): 541-552.
 """
-function mahalanobisBetweenPairs(dataMatrix::Array{Float64, 2})::Union{Nothing, Matrix}
+function mahalanobisBetweenPairs(dataMatrix::Matrix{Float64})::Union{Nothing, Matrix}
 
     n, _ = size(dataMatrix)
 
@@ -136,7 +136,7 @@ end
 Return vector of medians of each column in a matrix.
 
 # Arguments
-- `datamat::Array{Float64, 2}`: A matrix.
+- `datamat::Matrix{Float64}`: A matrix.
 
 # Example
 ```julia-repl
@@ -152,7 +152,7 @@ julia> coordinatwisemedians(mat)
  4.0
 ```
 """
-function coordinatwisemedians(datamat::Array{Float64, 2})::Array{Float64, 1}
+function coordinatwisemedians(datamat::Matrix{Float64})::Vector{Float64}
     _, p = size(datamat)
     meds = map(i -> median(datamat[:, i]), 1:p)
     return meds
@@ -199,7 +199,7 @@ function dffit(setting::RegressionSetting, i::Int)::Float64
     return dffit(X, y, i)
 end
 
-function dffit(X::Array{Float64, 2}, y::Array{Float64, 1}, i::Int)::Float64
+function dffit(X::Matrix{Float64}, y::Vector{Float64}, i::Int)::Float64
     n, _ = size(X)
     indices = [j for j ∈ 1:n if i != j]
     olsfull = ols(X, y)
@@ -255,13 +255,13 @@ julia> dffits(reg)
 Belsley, David A., Edwin Kuh, and Roy E. Welsch. Regression diagnostics:
 Identifying influential data and sources of collinearity. Vol. 571. John Wiley & Sons, 2005.
 """
-function dffits(setting::RegressionSetting)::Array{Float64, 1}
+function dffits(setting::RegressionSetting)::Vector{Float64}
     n, _ = size(setting.data)
     result = [dffit(setting, i) for i ∈ 1:n]
     return result
 end
 
-function dffits(X::Array{Float64, 2}, y::Array{Float64, 1})::Array{Float64, 1}
+function dffits(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64}
     n, _ = size(X)
     result = [dffit(X, y, i) for i ∈ 1:n]
     return result
@@ -283,12 +283,12 @@ julia> size(hatmatrix(reg))
 (24, 24)
 ```
 """
-function hatmatrix(setting::RegressionSetting)::Array{Float64, 2}
+function hatmatrix(setting::RegressionSetting)::Matrix{Float64}
     X = designMatrix(setting)
     return hatmatrix(X)
 end
 
-function hatmatrix(X::Array{Float64, 2})::Array{Float64, 2}
+function hatmatrix(X::Matrix{Float64})::Matrix{Float64}
     return X * inv(X'X) * X'
 end
@@ -332,12 +332,12 @@ julia> studentizedResiduals(reg)
 -1.529459974327181
 ```
 """
-function studentizedResiduals(setting::RegressionSetting)::Array{Float64, 1}
+function studentizedResiduals(setting::RegressionSetting)::Vector{Float64}
     X, y = @extractRegressionSetting setting
     return studentizedResiduals(X, y)
 end
 
-function studentizedResiduals(X::Array{Float64, 2}, y::Array{Float64, 1})::Array{Float64, 1}
+function studentizedResiduals(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64}
     olsreg = ols(X, y)
     n, p = size(X)
     e = residuals(olsreg)
@@ -388,13 +388,13 @@ julia> adjustedResiduals(reg)
 -85.9914301855088
 ```
 """
-function adjustedResiduals(setting::RegressionSetting)::Array{Float64, 1}
+function adjustedResiduals(setting::RegressionSetting)::Vector{Float64}
     X, y = @extractRegressionSetting setting
     return adjustedResiduals(X, y)
 end
 
 
-function adjustedResiduals(X::Array{Float64, 2}, y::Array{Float64, 1})::Array{Float64, 1}
+function adjustedResiduals(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64}
     olsreg = ols(X, y)
     n, _ = size(X)
     e = residuals(olsreg)
@@ -429,7 +429,7 @@ function jacknifedS(setting::RegressionSetting, k::Int)::Float64
     return jacknifedS(X, y, k)
 end
 
-function jacknifedS(X::Array{Float64, 2}, y::Array{Float64, 1}, k::Int)::Float64
+function jacknifedS(X::Matrix{Float64}, y::Vector{Float64}, k::Int)::Float64
     n, p = size(X)
     indices = [i for i ∈ 1:n if i != k]
     Xsub = X[indices, :]
@@ -486,12 +486,12 @@ julia> cooks(reg)
 Cook, R. Dennis. "Detection of influential observation in linear regression."
 Technometrics 19.1 (1977): 15-18.
 """
-function cooks(setting::RegressionSetting)::Array{Float64, 1}
+function cooks(setting::RegressionSetting)::Vector{Float64}
     X, y = @extractRegressionSetting setting
     return cooks(X, y)
 end
 
-function cooks(X::Array{Float64, 2}, y::Array{Float64, 1})::Array{Float64, 1}
+function cooks(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64}
     n, p = size(X)
     olsreg = ols(X, y)
     res = residuals(olsreg)
@@ -526,7 +526,7 @@ function cooksoutliers(setting::RegressionSetting; alpha::Float64 = 0.5)::Dict
     return cooksoutliers(X, y, alpha = alpha)
 end
 
-function cooksoutliers(X::Array{Float64, 2}, y::Array{Float64, 1}; alpha::Float64 = 0.5)::Dict
+function cooksoutliers(X::Matrix{Float64}, y::Vector{Float64}; alpha::Float64 = 0.5)::Dict
     n, p = size(X)
     d = cooks(X, y)
     cutoff = cookscritical(n, p)
@@ -542,8 +542,8 @@ Calculate Mahalanobis distances.
 
 # Arguments
 - `data::DataFrame`: A DataFrame object of the multivariate data.
-- `meanvector::Array{Float64, 1}`: Optional mean vector of variables.
-- `covmatrix::Array{Float64, 2}`: Optional covariance matrix of data.
+- `meanvector::Vector{Float64}`: Optional mean vector of variables.
+- `covmatrix::Matrix{Float64}`: Optional covariance matrix of data.
 
 # References
 Mahalanobis, Prasanta Chandra. "On the generalized distance in statistics."
@@ -553,7 +553,7 @@ function mahalanobisSquaredMatrix(
     data::DataFrame;
     meanvector = nothing,
     covmatrix = nothing,
-)::Union{Nothing, Array{Float64, 2}}
+)::Union{Nothing, Matrix{Float64}}
     datamat = Matrix(data)
     return mahalanobisSquaredMatrix(datamat, meanvector = meanvector, covmatrix = covmatrix)
 end
@@ -563,7 +563,7 @@ function mahalanobisSquaredMatrix(
     datamat::Matrix;
     meanvector = nothing,
     covmatrix = nothing,
-)::Union{Nothing, Array{Float64, 2}}
+)::Union{Nothing, Matrix{Float64}}
     if isnothing(meanvector)
         meanvector = applyColumns(mean, datamat)
     end
@@ -598,7 +598,7 @@ function dfbetas(setting)
     return mapreduce(permutedims, vcat, results)
 end
 
-function dfbetas(X::Array{Float64, 2}, y::Array{Float64, 1})
+function dfbetas(X::Matrix{Float64}, y::Vector{Float64})
     results = map(i -> dfbeta(X, y, i), 1:length(y))
     return mapreduce(permutedims, vcat, results)
 end
@@ -622,16 +622,16 @@ julia> dfbeta(setting, 1)
 -0.14686166007904422
 ```
 """
-function dfbeta(setting::RegressionSetting, omittedIndex::Int)::Array{Float64, 1}
+function dfbeta(setting::RegressionSetting, omittedIndex::Int)::Vector{Float64}
     X, y = @extractRegressionSetting setting
     return dfbeta(X, y, omittedIndex)
 end
 
 function dfbeta(
-    X::Array{Float64, 2},
-    y::Array{Float64, 1},
+    X::Matrix{Float64},
+    y::Vector{Float64},
     omittedIndex::Int,
-)::Array{Float64, 1}
+)::Vector{Float64}
     n = length(y)
     omittedindices = filter(x -> x != omittedIndex, 1:n)
     regfull = ols(X, y)
@@ -663,7 +663,7 @@ function covratio(setting::RegressionSetting, omittedIndex::Int)
     return covratio(X, y, omittedIndex)
 end
 
-function covratio(X::Array{Float64, 2}, y::Array{Float64, 1}, omittedIndex::Int)
+function covratio(X::Matrix{Float64}, y::Vector{Float64}, omittedIndex::Int)
     n, p = size(X)
     reg = ols(X, y)
     r = residuals(reg)
@@ -709,7 +709,7 @@ function hadimeasure(setting::RegressionSetting; c::Float64 = 2.0)
     hadimeasure(X, y, c = c)
 end
 
-function hadimeasure(X::Array{Float64, 2}, y::Array{Float64, 1}; c::Float64 = 2.0)
+function hadimeasure(X::Matrix{Float64}, y::Vector{Float64}; c::Float64 = 2.0)
     n, p = size(X)
     reg = ols(X, y)
     res = residuals(reg)
@@ -769,7 +769,7 @@ function diagnose(setting::RegressionSetting; alpha = 0.5)
 end
 
 
-function diagnose(X::Array{Float64, 2}, y::Array{Float64, 1}; alpha = 0.5)
+function diagnose(X::Matrix{Float64}, y::Vector{Float64}; alpha = 0.5)
     n, p = size(X)
     resultdffits = dffits(X, y)
     resultdfbetas = dfbetas(X, y)
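The diagnostics hunks above only rename types, but the central diagnostic they document is easy to state: Cook's distance for observation i is D_i = (e_i^2 / (p * s^2)) * h_ii / (1 - h_ii)^2, with leverages h_ii taken from the hat matrix H = X * inv(X'X) * X' shown in the hatmatrix hunk. A compact sketch of that textbook formula; `cooks_sketch` is illustrative and not necessarily the package's exact code path:

```julia
using LinearAlgebra: diag

# Cook's distances via the hat matrix; assumes X has full column rank.
function cooks_sketch(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64}
    n, p = size(X)
    H = X * inv(X'X) * X'          # hat matrix
    e = y - H * y                  # OLS residuals
    s2 = sum(e .^ 2) / (n - p)     # residual variance estimate
    h = diag(H)                    # leverages
    return (e .^ 2 ./ (p * s2)) .* h ./ (1.0 .- h) .^ 2
end
```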
""" -function cooks(setting::RegressionSetting)::Array{Float64, 1} +function cooks(setting::RegressionSetting)::Vector{Float64} X, y = @extractRegressionSetting setting return cooks(X, y) end -function cooks(X::Array{Float64, 2}, y::Array{Float64, 1})::Array{Float64, 1} +function cooks(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64} n, p = size(X) olsreg = ols(X, y) res = residuals(olsreg) @@ -526,7 +526,7 @@ function cooksoutliers(setting::RegressionSetting; alpha::Float64 = 0.5)::Dict return cooksoutliers(X, y, alpha = alpha) end -function cooksoutliers(X::Array{Float64, 2}, y::Array{Float64, 1}; alpha::Float64 = 0.5)::Dict +function cooksoutliers(X::Matrix{Float64}, y::Vector{Float64}; alpha::Float64 = 0.5)::Dict n, p = size(X) d = cooks(X, y) cutoff = cookscritical(n, p) @@ -542,8 +542,8 @@ Calculate Mahalanobis distances. # Arguments - `data::DataFrame`: A DataFrame object of the multivariate data. -- `meanvector::Array{Float64, 1}`: Optional mean vector of variables. -- `covmatrix::Array{Float64, 2}`: Optional covariance matrix of data. +- `meanvector::Vector{Float64}`: Optional mean vector of variables. +- `covmatrix::Matrix{Float64}`: Optional covariance matrix of data. # References Mahalanobis, Prasanta Chandra. "On the generalized distance in statistics." @@ -553,7 +553,7 @@ function mahalanobisSquaredMatrix( data::DataFrame; meanvector = nothing, covmatrix = nothing, -)::Union{Nothing, Array{Float64, 2}} +)::Union{Nothing, Matrix{Float64}} datamat = Matrix(data) return mahalanobisSquaredMatrix(datamat, meanvector = meanvector, covmatrix = covmatrix) end @@ -563,7 +563,7 @@ function mahalanobisSquaredMatrix( datamat::Matrix; meanvector = nothing, covmatrix = nothing, -)::Union{Nothing, Array{Float64, 2}} +)::Union{Nothing, Matrix{Float64}} if isnothing(meanvector) meanvector = applyColumns(mean, datamat) end @@ -598,7 +598,7 @@ function dfbetas(setting) return mapreduce(permutedims, vcat, results) end -function dfbetas(X::Array{Float64, 2}, y::Array{Float64, 1}) +function dfbetas(X::Matrix{Float64}, y::Vector{Float64}) results = map(i -> dfbeta(X, y, i), 1:length(y)) return mapreduce(permutedims, vcat, results) end @@ -622,16 +622,16 @@ julia> dfbeta(setting, 1) -0.14686166007904422 ``` """ -function dfbeta(setting::RegressionSetting, omittedIndex::Int)::Array{Float64, 1} +function dfbeta(setting::RegressionSetting, omittedIndex::Int)::Vector{Float64} X, y = @extractRegressionSetting setting return dfbeta(X, y, omittedIndex) end function dfbeta( - X::Array{Float64, 2}, - y::Array{Float64, 1}, + X::Matrix{Float64}, + y::Vector{Float64}, omittedIndex::Int, -)::Array{Float64, 1} +)::Vector{Float64} n = length(y) omittedindices = filter(x -> x != omittedIndex, 1:n) regfull = ols(X, y) @@ -663,7 +663,7 @@ function covratio(setting::RegressionSetting, omittedIndex::Int) return covratio(X, y, omittedIndex) end -function covratio(X::Array{Float64, 2}, y::Array{Float64, 1}, omittedIndex::Int) +function covratio(X::Matrix{Float64}, y::Vector{Float64}, omittedIndex::Int) n, p = size(X) reg = ols(X, y) r = residuals(reg) @@ -709,7 +709,7 @@ function hadimeasure(setting::RegressionSetting; c::Float64 = 2.0) hadimeasure(X, y, c = c) end -function hadimeasure(X::Array{Float64, 2}, y::Array{Float64, 1}; c::Float64 = 2.0) +function hadimeasure(X::Matrix{Float64}, y::Vector{Float64}; c::Float64 = 2.0) n, p = size(X) reg = ols(X, y) res = residuals(reg) @@ -769,7 +769,7 @@ function diagnose(setting::RegressionSetting; alpha = 0.5) end -function diagnose(X::Array{Float64, 2}, 
diff --git a/src/lad.jl b/src/lad.jl
index e49f3ae..833003e 100644
--- a/src/lad.jl
+++ b/src/lad.jl
@@ -68,8 +68,8 @@ end
 Perform Least Absolute Deviations regression for a given regression setting.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix of the linear model.
-- `y::Array{Float64, 1}`: Response vector of the linear model.
+- `X::Matrix{Float64}`: Design matrix of the linear model.
+- `y::Vector{Float64}`: Response vector of the linear model.
 - `exact::Bool`: If true, use exact LAD regression. If false, estimate LAD regression parameters using GA. Default is true.
 """
 function lad(X::Array{Float64,2}, y::Array{Float64,1}; exact::Bool = true)
diff --git a/src/lta.jl b/src/lta.jl
index 5483b75..bf4c0d0 100644
--- a/src/lta.jl
+++ b/src/lta.jl
@@ -66,8 +66,8 @@ Perform the Hawkins & Olive (1999) algorithm (Least Trimmed Absolute Deviations)
 for the given regression setting.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix of linear regression model.
-- `y::Array{Float64, 1}`: Response vector of linear regression model.
+- `X::Matrix{Float64}`: Design matrix of linear regression model.
+- `y::Vector{Float64}`: Response vector of linear regression model.
 - `exact::Bool`: Consider all possible subsets of p or not where p is the number of regression parameters.
 - `earlystop::Bool`: Early stop if the best objective does not change in number of remaining iters / 5 iterations.
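The lad docstring above distinguishes an exact solver from a GA approximation. Exact LAD is a linear program: minimize the sum of slacks t_i subject to |y_i - x_i'beta| <= t_i. A generic sketch, assuming JuMP and HiGHS are installed; the package's own exact path may be formulated differently:

```julia
using JuMP, HiGHS

# LP formulation of exact LAD: each absolute residual is bounded by a
# nonnegative slack t[i], and the sum of slacks is minimized.
function lad_lp(X::Matrix{Float64}, y::Vector{Float64})::Vector{Float64}
    n, p = size(X)
    model = Model(HiGHS.Optimizer)
    set_silent(model)
    @variable(model, beta[1:p])
    @variable(model, t[1:n] >= 0)
    @constraint(model, [i = 1:n], y[i] - X[i, :]' * beta <= t[i])
    @constraint(model, [i = 1:n], X[i, :]' * beta - y[i] <= t[i])
    @objective(model, Min, sum(t))
    optimize!(model)
    return value.(beta)
end
```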
diff --git a/src/lts.jl b/src/lts.jl
index 44dda84..6da394a 100644
--- a/src/lts.jl
+++ b/src/lts.jl
@@ -47,7 +47,6 @@ function iterateCSteps(
     maxiter::Int = 10000
     eps::Float64 = 0.1
     while iter < maxiter
-        #try
         olsreg = ols(X[subsetindices, :], y[subsetindices])
         betas = coef(olsreg)
         res = y - X * betas
@@ -59,10 +58,6 @@ function iterateCSteps(
         end
         oldobjective = objective
         iter += 1
-        #catch er
-        #    @warn er
-        #    return (objective, subsetindices)
-        #end
     end
     #if iter >= maxiter
     #    @warn "in c-step stage of LTS, a h-subset is not converged for starting indices " starterset
diff --git a/src/ols.jl b/src/ols.jl
index 40192cd..086c1e8 100644
--- a/src/ols.jl
+++ b/src/ols.jl
@@ -17,9 +17,9 @@ import ..Basis:
 Immutable data structure that holds design matrix, response vector, and estimated regression parameters.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix.
-- `y::Array{Float64, 1}`: Response vector.
-- `betas::Array{Float64, 1}`: Regression coefficients.
+- `X::Matrix{Float64}`: Design matrix.
+- `y::Vector{Float64}`: Response vector.
+- `betas::Vector{Float64}`: Regression coefficients.
 """
 struct OLS
 
@@ -35,8 +35,8 @@ end
 Create OLS object with estimated regression coefficients.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix.
-- `y::Array{Float64, 1}`: Response vector.
+- `X::Matrix{Float64}`: Design matrix.
+- `y::Vector{Float64}`: Response vector.
 
 # Examples
 ```julia-repl
@@ -67,9 +67,9 @@ end
 Estimate weighted least squares regression and create OLS object with estimated parameters.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix.
-- `y::Array{Float64, 1}`: Response vector.
-- `wts::Array{Float64, 1}`: Weights vector.
+- `X::Matrix{Float64}`: Design matrix.
+- `y::Vector{Float64}`: Response vector.
+- `wts::Vector{Float64}`: Weights vector.
 
 # Examples
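A quick numeric aside on the OLS struct documented above: the betas field holds the solution of the normal equations (X'X) beta = X'y, which Julia's backslash operator computes directly. The data values below are made up for illustration:

```julia
# Backslash returns the least-squares coefficients for an
# intercept-plus-slope design; here intercept ≈ 0.15, slope ≈ 1.94.
X = [1.0 1.0; 1.0 2.0; 1.0 3.0; 1.0 4.0]
y = [2.1, 3.9, 6.2, 7.8]
betas = X \ y
```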
diff --git a/src/py95.jl b/src/py95.jl
index 0283a35..778ebe9 100644
--- a/src/py95.jl
+++ b/src/py95.jl
@@ -19,7 +19,7 @@ import LinearAlgebra: eigen
 Process eigen vectors of EDHDE matrix as defined in the Pena & Yohai (1995) algorithm.
 
 # Arguments
-- `v::Array{Float64, 1}`: Eigen vector of EDHDE matrix.
+- `v::Vector{Float64}`: Eigen vector of EDHDE matrix.
 
 # References
 Peña, Daniel, and Victor J. Yohai. "The detection of influential subsets in linear
diff --git a/src/quantileregression.jl b/src/quantileregression.jl
index 2025847..13be674 100644
--- a/src/quantileregression.jl
+++ b/src/quantileregression.jl
@@ -62,8 +62,8 @@ end
 Estimates parameters of linear regression using Quantile Regression Estimator for a given regression setting.
 
 # Arguments
-- `X::Array{Float64, 2}`: Design matrix of the linear model.
-- `y::Array{Float64, 1}`: Response vector of the linear model.
+- `X::Matrix{Float64}`: Design matrix of the linear model.
+- `y::Vector{Float64}`: Response vector of the linear model.
 - `tau::Float64`: Quantile level. Default is 0.5.
 
diff --git a/src/smr98.jl b/src/smr98.jl
index eacd6d3..c2131c3 100644
--- a/src/smr98.jl
+++ b/src/smr98.jl
@@ -79,8 +79,8 @@ end
 Perform the Sebert, Monthomery and Rollier (1998) algorithm for the given regression setting.
 
 # Arguments
-- `X::Array{Float64, 2}`: Desing matrix of the linear regression model.
-- `y::Array{Float64, 1}`: Response vector of the linear regression model.
+- `X::Matrix{Float64}`: Design matrix of the linear regression model.
+- `y::Vector{Float64}`: Response vector of the linear regression model.
 
 # References
diff --git a/src/theilsen.jl b/src/theilsen.jl
index bd31a5a..1465a8d 100644
--- a/src/theilsen.jl
+++ b/src/theilsen.jl
@@ -34,7 +34,7 @@ function theilsen(setting::RegressionSetting, m::Int; nsamples::Int = 5000)
     return theilsen(X, y, m, nsamples = nsamples)
 end
 
-function theilsen(X::Array{Float64, 2}, y::Array{Float64, 1}, m::Int; nsamples::Int = 5000)
+function theilsen(X::Matrix{Float64}, y::Vector{Float64}, m::Int; nsamples::Int = 5000)
 
     n, p = size(X)
 
@@ -42,7 +42,7 @@ function theilsen(X::Array{Float64, 2}, y::Array{Float64, 1}, m::Int; nsamples::
         error("m must be in the range [p, n]")
     end
 
-    allbetas = Array{Float64, 2}(undef, nsamples, p)
+    allbetas = Matrix{Float64}(undef, nsamples, p)
 
     for i in 1:nsamples
         luckyindices = sample(1:n, m, replace = false)
@@ -55,7 +55,7 @@ function theilsen(X::Array{Float64, 2}, y::Array{Float64, 1}, m::Int; nsamples::
     return Dict("betas" => multimed)
 end
 
-function multivariatemedian(X::Array{Float64, 2})
+function multivariatemedian(X::Matrix{Float64})
     n, p = size(X)
 
     function dist(x::Vector{Float64}, y::Vector{Float64})
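A closing note on the theilsen hunks: the function fits OLS on nsamples random subsets, stacks the coefficient vectors into allbetas, and returns their multivariate median. The classical univariate version, the median of all pairwise slopes, shows the idea; `theilsen_simple` is illustrative, not the package API:

```julia
using Statistics: median

# Univariate Theil-Sen: taking medians keeps a single wild point
# from dominating the slope and intercept estimates.
function theilsen_simple(x::Vector{Float64}, y::Vector{Float64})
    n = length(x)
    slopes = [(y[j] - y[i]) / (x[j] - x[i]) for i in 1:n for j in (i+1):n]
    b = median(slopes)
    a = median(y .- b .* x)   # intercept from the shifted responses
    return (a, b)
end

# The last response is an outlier; the slope estimate stays near 2.
theilsen_simple([1.0, 2.0, 3.0, 4.0, 5.0], [2.0, 4.1, 5.9, 8.2, 100.0])
```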