xKDR · smishr · Jan 22, 2023 · Feb 3, 2023 · Feb 4, 2023 · Feb 5, 2023
diff --git a/Project.toml b/Project.toml
@@ -9,6 +9,7 @@ CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"

diff --git a/src/Survey.jl b/src/Survey.jl
@@ -4,6 +4,7 @@ using DataFrames
 using Statistics
 import Statistics: quantile
 using StatsBase
+using Distributions
 import StatsBase: mean, quantile
 using CSV
 using LinearAlgebra
@@ -26,6 +27,7 @@ include("show.jl")
 include("ratio.jl")
 include("by.jl")
 include("jackknife.jl")
+include("ci.jl")
 
 export load_data
 export AbstractSurveyDesign, SurveyDesign, ReplicateDesign

diff --git a/src/ci.jl b/src/ci.jl
@@ -0,0 +1,18 @@
+"""
+    Calculate confidence intervals for given estimates.
+    Supports normal, margin of error and t-distribution based CI.
+"""
+function _ci(estimate::AbstractVector, se::AbstractVector, type::String, alpha::Float64, dof::Int64, margin::Float64)
+    # Parse type of CI, calc critical value
+    if type == "normal"
+        critical_value = quantile(Normal(),1-alpha/2)
+    elseif type == "margin"
+        critical_value = margin
+    elseif type == "t"
+        critical_value = quantile(TDist(dof),1-alpha/2)
+    end
+    # Calculate upper and lower estimates
+    ci_lower = estimate .- critical_value .* se
+    ci_upper = estimate .+ critical_value .* se
+    return ci_lower, ci_upper
+end
diff --git a/src/mean.jl b/src/mean.jl
@@ -142,3 +142,59 @@ function mean(x::Symbol, domain, design::AbstractSurveyDesign)
     rename!(df, :statistic => :mean)
     return df
 end
+
+"""
+    mean(x, design, ci_type; kwargs)
+    mean(x, domain, design, ci_type; kwargs)
+
+Confidence intervals for `mean`. Three options for ci_type:
+```julia
+ci_type="normal"    # use Normal distribution critical values
+ci_type="t"         # use Student t distribution critical values
+ci_type="margin"    # use margin of error
+```
+
+Mathematically, the confidence interval is the range
+```math
+\\left[\\bar{x} - critical value * SE , \\bar{x} + critical value * SE  \\right]
+```
+
+Keyword arguments for each type of CI
+```julia
+alpha         # Significance level. Confidence level is 100*(1 - alpha)%
+dof           # Degrees of freedom when ci_type="t"
+margin        # Margin of error when ci_type="margin"
+```
+Also works when `Vector{Symbol}` and `domain` are specified.
+```julia
+# TODO example
+```
+## External links
+[Confidence intervals on Wikipedia](https://en.wikipedia.org/wiki/Confidence_interval)
+"""
+function mean(x::Symbol, design::ReplicateDesign, ci_type::String;
+    alpha::Float64=0.05, dof::Int64=nrow(design.data)-1, margin::Float64=2.0)
+    df_mean = mean(x, design)
+    ci_lower, ci_upper = _ci(df_mean[!,1], df_mean[!,2], ci_type, alpha, dof, margin)
+    df_mean[!,:ci_lower] = ci_lower
+    df_mean[!,:ci_upper] = ci_upper
+    return df_mean
+end
+
+function mean(x::Vector{Symbol}, design::ReplicateDesign, ci_type::String;
+    alpha::Float64=0.05, dof::Int64=nrow(design.data)-1, margin::Float64=2.0)
+    df_mean = mean(x, design)
+    ci_lower, ci_upper = _ci(df_mean[!,2], df_mean[!,3], ci_type, alpha, dof, margin) # mean and SE are in 2nd and 3rd columns
+    df_mean[!,:ci_lower] = ci_lower
+    df_mean[!,:ci_upper] = ci_upper
+    return df_mean
+end
+
+function mean(x::Symbol, domain::Symbol, design::ReplicateDesign, ci_type::String; 
+    alpha::Float64=0.05, dof::Int64=nrow(design.data)-1, margin::Float64=2.0)
+    df_mean = mean(x, domain, design)
+    ci_lower, ci_upper = _ci(df_mean[!,2], df_mean[!,3], ci_type, alpha, dof, margin) # domain mean and SE are in 2nd and 3rd columns
+    df_mean[!,:ci_lower] = ci_lower
+    df_mean[!,:ci_upper] = ci_upper
+    return df_mean
+end
diff --git a/test/ci.jl b/test/ci.jl
@@ -0,0 +1,37 @@
+@testset "ci.jl" begin
+    #### Each of the 3 options with default keyword arguments
+    # 95% CI - normal
+    @test mean(:api00, dclus1_boot, "normal").ci_lower[1] ≈ 598.28529 atol = 1e-4
+    # 95% CI, with dof=Infinity - t
+    @test mean(:api00, dclus1_boot, "t").ci_upper[1] ≈ 690.3606 atol = 1e-4
+    # margin of 2 SE
+    @test mean(:api00, dclus1_boot, "margin").ci_upper[1] ≈ 690.99077 atol = 1e-4
+
+    #### Test "normal" keyword options 
+    # 90% CI
+    @test mean(:api00, dclus1_boot, "normal", alpha = 0.1).ci_upper[1] ≈ 682.67655 atol = 1e-4
+    # 85% CI
+    @test mean(:api00, dclus1_boot, "normal", alpha = 0.15).ci_lower[1] ≈ 610.469 atol = 1e-4
+
+    #### Test "t" keyword options
+    #### For illustration purposes, dclus1_boot is actually a 'large' sample
+    # 90% CI
+    @test mean(:api00, dclus1_boot, "t", dof = 30).ci_upper[1] ≈ 691.9804 atol = 1e-4
+    # 85% CI
+    @test mean(:api00, dclus1_boot, "t", alpha = 0.1, dof = 50).ci_lower[1] ≈ 604.9353 atol = 1e-4
+
+    #### Test "t" keyword options
+    # 3 - sigma
+    @test mean(:api00, dclus1_boot, "margin", margin=3.0).ci_upper[1] ≈ 714.40146 atol = 1e-4 
+    # Six-sigma
+    @test mean(:api00, dclus1_boot, "margin", margin=6.0).ci_lower[1] ≈ 503.70526 atol = 1e-4
+
+    #### Test Vector of Symbols
+    @test mean([:api00, :enroll], dclus1_boot, "normal").ci_lower[2] ≈ 459.98174 atol = 1e-4
+    @test mean([:api00, :enroll], dclus1_boot, "normal").ci_upper[2] ≈ 639.44995 atol = 1e-4
+
+    #### Test domain estimation
+    mn = mean(:api00, :cname, dclus1_boot, "normal")
+    @test filter(:cname => ==("Los Angeles"), mn).ci_lower[1] ≈ 553.92680 atol = 1e-4
+    @test filter(:cname => ==("Santa Clara"), mn).ci_upper[1] ≈ 846.17990 atol = 1e-4
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -63,4 +63,5 @@ include("hist.jl")
 include("boxplot.jl")
 include("ratio.jl")
 include("show.jl")
-include("jackknife.jl")
+include("jackknife.jl")
+include("ci.jl")