Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add general CI and integration with mean #190

Draft
wants to merge 18 commits into
base: next_release
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down
2 changes: 2 additions & 0 deletions src/Survey.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ using DataFrames
using Statistics
import Statistics: quantile
using StatsBase
using Distributions
import StatsBase: mean, quantile
using CSV
using LinearAlgebra
Expand All @@ -26,6 +27,7 @@ include("show.jl")
include("ratio.jl")
include("by.jl")
include("jackknife.jl")
include("ci.jl")

export load_data
export AbstractSurveyDesign, SurveyDesign, ReplicateDesign
Expand Down
18 changes: 18 additions & 0 deletions src/ci.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
"""
Calculate confidence intervals for given estimates.
Supports normal, margin of error and t-distribution based CI.
"""
function _ci(estimate::AbstractVector, se::AbstractVector, type::String, alpha::Float64, dof::Int64, margin::Float64)
# Parse type of CI, calc critical value
if type == "normal"
critical_value = quantile(Normal(),1-alpha/2)
elseif type == "margin"
critical_value = margin
elseif type == "t"
critical_value = quantile(TDist(dof),1-alpha/2)
end
# Calculate upper and lower estimates
ci_lower = estimate .- critical_value .* se
ci_upper = estimate .+ critical_value .* se
return ci_lower, ci_upper
end
56 changes: 56 additions & 0 deletions src/mean.jl
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,59 @@ function mean(x::Symbol, domain, design::AbstractSurveyDesign)
rename!(df, :statistic => :mean)
return df
end

"""
mean(x, design, ci_type; kwargs)
mean(x, domain, design, ci_type; kwargs)

Confidence intervals for `mean`. Three options for ci_type:
```julia
ci_type="normal" # use Normal distribution critical values
ci_type="t" # use Student t distribution critical values
ci_type="margin" # use margin of error
```

Mathematically, the confidence interval is the range
```math
\\left[\\bar{x} - critical value * SE , \\bar{x} + critical value * SE \\right]
```

Keyword arguments for each type of CI
```julia
alpha # Significance level. Confidence level is 100*(1 - alpha)%
dof # Degrees of freedom when ci_type="t"
margin # Margin of error when ci_type="margin"
```
Also works when `Vector{Symbol}` and `domain` are specified.
```julia
# TODO example
```
## External links
[Confidence intervals on Wikipedia](https://en.wikipedia.org/wiki/Confidence_interval)
"""
function mean(x::Symbol, design::ReplicateDesign, ci_type::String;
alpha::Float64=0.05, dof::Int64=nrow(design.data)-1, margin::Float64=2.0)
df_mean = mean(x, design)
ci_lower, ci_upper = _ci(df_mean[!,1], df_mean[!,2], ci_type, alpha, dof, margin)
df_mean[!,:ci_lower] = ci_lower
df_mean[!,:ci_upper] = ci_upper
return df_mean
end

function mean(x::Vector{Symbol}, design::ReplicateDesign, ci_type::String;
alpha::Float64=0.05, dof::Int64=nrow(design.data)-1, margin::Float64=2.0)
df_mean = mean(x, design)
ci_lower, ci_upper = _ci(df_mean[!,2], df_mean[!,3], ci_type, alpha, dof, margin) # mean and SE are in 2nd and 3rd columns
df_mean[!,:ci_lower] = ci_lower
df_mean[!,:ci_upper] = ci_upper
return df_mean
end

function mean(x::Symbol, domain::Symbol, design::ReplicateDesign, ci_type::String;
alpha::Float64=0.05, dof::Int64=nrow(design.data)-1, margin::Float64=2.0)
df_mean = mean(x, domain, design)
ci_lower, ci_upper = _ci(df_mean[!,2], df_mean[!,3], ci_type, alpha, dof, margin) # domain mean and SE are in 2nd and 3rd columns
df_mean[!,:ci_lower] = ci_lower
df_mean[!,:ci_upper] = ci_upper
return df_mean
end
37 changes: 37 additions & 0 deletions test/ci.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
@testset "ci.jl" begin
#### Each of the 3 options with default keyword arguments
# 95% CI - normal
@test mean(:api00, dclus1_boot, "normal").ci_lower[1] ≈ 598.28529 atol = 1e-4
# 95% CI, with dof=Infinity - t
@test mean(:api00, dclus1_boot, "t").ci_upper[1] ≈ 690.3606 atol = 1e-4
# margin of 2 SE
@test mean(:api00, dclus1_boot, "margin").ci_upper[1] ≈ 690.99077 atol = 1e-4

#### Test "normal" keyword options
# 90% CI
@test mean(:api00, dclus1_boot, "normal", alpha = 0.1).ci_upper[1] ≈ 682.67655 atol = 1e-4
# 85% CI
@test mean(:api00, dclus1_boot, "normal", alpha = 0.15).ci_lower[1] ≈ 610.469 atol = 1e-4

#### Test "t" keyword options
#### For illustration purposes, dclus1_boot is actually a 'large' sample
# 90% CI
@test mean(:api00, dclus1_boot, "t", dof = 30).ci_upper[1] ≈ 691.9804 atol = 1e-4
# 85% CI
@test mean(:api00, dclus1_boot, "t", alpha = 0.1, dof = 50).ci_lower[1] ≈ 604.9353 atol = 1e-4

#### Test "t" keyword options
# 3 - sigma
@test mean(:api00, dclus1_boot, "margin", margin=3.0).ci_upper[1] ≈ 714.40146 atol = 1e-4
# Six-sigma
@test mean(:api00, dclus1_boot, "margin", margin=6.0).ci_lower[1] ≈ 503.70526 atol = 1e-4

#### Test Vector of Symbols
@test mean([:api00, :enroll], dclus1_boot, "normal").ci_lower[2] ≈ 459.98174 atol = 1e-4
@test mean([:api00, :enroll], dclus1_boot, "normal").ci_upper[2] ≈ 639.44995 atol = 1e-4

#### Test domain estimation
mn = mean(:api00, :cname, dclus1_boot, "normal")
@test filter(:cname => ==("Los Angeles"), mn).ci_lower[1] ≈ 553.92680 atol = 1e-4
@test filter(:cname => ==("Santa Clara"), mn).ci_upper[1] ≈ 846.17990 atol = 1e-4
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,4 +63,5 @@ include("hist.jl")
include("boxplot.jl")
include("ratio.jl")
include("show.jl")
include("jackknife.jl")
include("jackknife.jl")
include("ci.jl")