Skip to content

Commit

Permalink
Add Bootstrap Regression (Linear Regression)
Browse files Browse the repository at this point in the history
  • Loading branch information
ShouvikGhosh2048 committed Feb 24, 2023
1 parent 33b5780 commit c6f2f50
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 2 deletions.
10 changes: 9 additions & 1 deletion src/CRRao.jl
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,14 @@ where
"""
struct PoissonRegression end

"""
```julia
Boot_Residual
```
Type representing the residual bootstrap. Pass an instance of this type as the
`bootstrap` argument of `fit` to estimate regression-coefficient variability by
resampling the fitted model's residuals with replacement.
"""
struct Boot_Residual end

"""
```julia
Prior_Gauss
Expand Down Expand Up @@ -384,7 +392,7 @@ end

# Zero-argument convenience constructor: builds a `Cauchit` with the default
# `Cauchit_Link` link function.
Cauchit() = Cauchit(Cauchit_Link)

export LinearRegression, LogisticRegression, PoissonRegression, NegBinomRegression
export LinearRegression, LogisticRegression, PoissonRegression, NegBinomRegression, Boot_Residual
export Prior_Ridge, Prior_Laplace, Prior_Cauchy, Prior_TDist, Prior_HorseShoe, Prior_Gauss
export CRRaoLink, Logit, Probit, Cloglog, Cauchit, fit
export coef, coeftable, r2, adjr2, loglikelihood, aic, bic, sigma, predict, residuals, cooksdistance, BPTest, pvalue
Expand Down
89 changes: 89 additions & 0 deletions src/frequentist/linear_regression.jl
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,92 @@ function fit(formula::FormulaTerm, data::DataFrame, modelClass::LinearRegression
model = lm(formula, data; kwargs...)
return FrequentistRegression(:LinearRegression, model, formula)
end

"""
```julia
fit(formula::FormulaTerm, data::DataFrame, modelClass::LinearRegression, bootstrap::Boot_Residual, sim_size::Int64 = 1000)
```
Fit a Bootstrap Regression model on the input data. Uses the [lm](https://juliastats.org/GLM.jl/stable/api/#GLM.lm) method from the [GLM](https://github.com/JuliaStats/GLM.jl) package under the hood. Returns an object of type `DataFrame`.

For each of the `sim_size` bootstrap replicates, the model residuals are
resampled with replacement and the OLS coefficient perturbation
`β* = β̂ + (X'X)⁻¹ X' e*` is recorded. The returned `DataFrame` has one row per
predictor with columns `Predictor`, `Coef` (bootstrap mean), `Std Error`
(bootstrap standard deviation), `Lower 5%` and `Upper 95%` (bootstrap
quantiles).

Throws an `ArgumentError` when `sim_size` is not positive.
# Example
```julia-repl
julia> using CRRao, RDatasets, StableRNGs, StatsModels
julia> df = dataset("datasets", "mtcars");
julia> CRRao.set_rng(StableRNG(123))
StableRNGs.LehmerRNG(state=0x000000000000000000000000000000f7)
julia> container = fit(@formula(MPG ~ HP + WT + Gear), df, LinearRegression(), Boot_Residual())
4×5 DataFrame
 Row │ Predictor    Coef        Std Error   Lower 5%    Upper 95%
     │ String       Float64     Float64     Float64     Float64
─────┼─────────────────────────────────────────────────────────────
   1 │ (Intercept)  32.1309     4.57528     24.8024     39.9568
   2 │ HP           -0.0364971  0.00962225  -0.0519917  -0.0201571
   3 │ WT           -3.22576    0.834607    -4.61517    -1.80358
   4 │ Gear          1.00012    0.842335    -0.429382    2.35324
```
"""
function fit(
    formula::FormulaTerm,
    data::DataFrame,
    modelClass::LinearRegression,
    bootstrap::Boot_Residual,
    sim_size::Int64 = 1000)

    sim_size > 0 || throw(ArgumentError("sim_size must be positive, got $sim_size"))

    # Materialize the design matrix according to the model schema.
    # (The response column from `modelcols` is not needed here.)
    formula = apply_schema(formula, schema(formula, data), RegressionModel)
    _, X = modelcols(formula, data)

    # Base OLS fit; its residuals are resampled below.
    model = lm(formula, data)
    e = residuals(model)

    β_hat = coef(model)
    p = length(β_hat)
    n = size(X, 1)

    ## Recover (X'X)⁻¹ from the covariance matrix: vcov = σ̂²·(X'X)⁻¹, so
    ## dividing by the dispersion σ̂² leaves (X'X)⁻¹. This is inefficient;
    ## once Mousum's code is merged, revert to a direct inverse:
    # A = GLM.inv(model)
    A = vcov(model) / dispersion(model.model, true)

    # Residual bootstrap: β* = β̂ + (X'X)⁻¹ X' e*, where e* is a resample
    # (with replacement) of the fitted residuals.
    β_star_matrix = zeros(sim_size, p)
    for b in 1:sim_size
        e_resample = rand(CRRao_rng, e, n)
        β_star_matrix[b, :] = β_hat + A * X' * e_resample
    end

    # Summarize the bootstrap distribution of each coefficient.
    bootstrap_coef_table = zeros(p, 4)
    bootstrap_coef_table[:, 1] = mean(β_star_matrix, dims = 1)
    bootstrap_coef_table[:, 2] = std(β_star_matrix, dims = 1)
    for j in 1:p
        bootstrap_coef_table[j, 3] = quantile(β_star_matrix[:, j], 0.05)
        bootstrap_coef_table[j, 4] = quantile(β_star_matrix[:, j], 0.95)
    end

    col_names = ["Coef", "Std Error", "Lower 5%", "Upper 95%"]
    bootstrap_coeftable = DataFrame(bootstrap_coef_table, col_names)
    # Label each row with the predictor name, taken from the fitted model's
    # coefficient table (first column of its DataFrame form).
    row_names = DataFrame(coeftable(model))[!, 1]
    insertcols!(bootstrap_coeftable, 1, :Predictor => row_names)

    return bootstrap_coeftable
end
6 changes: 5 additions & 1 deletion test/numerical/frequentist/LinearRegression.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,8 @@ for (test_formula, test_aic, test_bic, test_pvalue) in tests
bp_test = BPTest(crrao_model, mtcars)
@test isapprox(pvalue(bp_test), test_pvalue)
end
end
end

# Residual-bootstrap regression smoke test: with the RNG pinned to
# StableRNG(123) the bootstrap coefficient means must reproduce exactly
# (reference values generated with that seed).
CRRao.set_rng(StableRNG(123))
container = fit(@formula(MPG ~ HP + WT + Gear), mtcars, LinearRegression(), Boot_Residual())
@test isapprox(container.Coef, [32.13093064426382, -0.036497081866302794, -3.2257634531003574, 1.0001169309164597])

0 comments on commit c6f2f50

Please sign in to comment.