read/write changes, document errors, minor regtable improvements

Gkreindler · Dec 31, 2023 · 9ad96dd · 9ad96dd
1 parent a45841c
commit 9ad96dd
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 42 deletions.
diff --git a/README.md b/README.md
@@ -73,11 +73,23 @@ myfit = GMMTools.fit(MYDATA, mom_fn, theta0, mode=:twostep, opts=myopts)
 ```
 
 ## Writing and reading results
-`GMMTools.fit(...)` saves the estimation results in two files: `fit.csv` contains a table with one row per set of initial conditions, and `fit.json` contains several estimation parameters and results. `GMMTools.write(myfit::GMMFit, opts::GMMOptions, filename)` is the lower level function to write `GMMFit` objects to files.
+`GMMTools.fit(...)` saves the estimation results in two files: `fit.csv` contains a table with one row per set of initial conditions, and `fit.json` contains several estimation parameters and results. `GMMTools.write(myfit::GMMFit, opts::GMMOptions; subpath="fit")` is the lower level function to write `GMMFit` objects to files.
 
-`GMMTools.vcov_simple(...)` saves a `vcov.json` file that includes, among other objects, the variance-covariance matrix `myfit.vcov.V`. `GMMTools.vcov_bboot(...)` also saves two files `vcov_boot_fits_df.csv` (all individual runs for all bootstrap runs) and `vcov_boot_weights.csv` (rows are bootstrap runs, columns are data observations). The lower level function is `GMMTools.write(myvcov::GMMvcov, opts::GMMOptions)`.
+`GMMTools.vcov_simple(...)` saves a `vcov.json` file that includes, among other objects, the variance-covariance matrix `myfit.vcov.V`. `GMMTools.vcov_bboot(...)` also saves two files `vcov_boot_fits_df.csv` (all individual runs for all bootstrap runs) and `vcov_boot_weights.csv` (rows are bootstrap runs, columns are data observations). The lower level function is `GMMTools.write(myvcov::GMMvcov, opts::GMMOptions; subpath="vcov")`.
+
+`GMMTools.read_fit(full_path; subpath="fit", show_trace=false)` reads estimation results and loads them into a `GMMFit` object. `GMMTools.read_vcov(full_path; subpath="vcov", show_trace=false)` reads vcov results and loads them into a `GMMvcov` object. Note that `GMMTools.read_fit()` also attempts to read the vcov from the same folder. If the vcov is stored in a different folder, read it separately and attach it to the fit:
+```julia
+myfit = GMMTools.read_fit(mypath1)
+myfit.vcov = GMMTools.read_vcov(mypath2)
+```
+
+## Capturing errors
+By default, any error during optimization stops the entire estimation (or inference) command.
+
+Set the `GMMOptions` field `throw_errors=false` to capture these errors and write them to file without interrupting the rest of the estimation procedure.
+- when using multiple initial conditions, all iterations that error are recorded in `myfit.fits_df` with `errored=true` and `obj_value=Inf`. If all iterations error, `myfit.errored=true` and several other fields are recorded as `missing`.
+- for bootstrap results, similar rules apply. Note that inference objects (SEs, vcov, etc.) are computed dropping the bootstrap runs that errored. `@warn` messages should be displayed to remind the user that this is happening. It is the user's responsibility to ensure this behavior is ok for their use case.
 
-`GMMTools.read_fit(opts::GMMOptions; filepath="")` reads estimation results and loads them into a `GMMFit` object. `GMMTools.read_vcov(opts::GMMOptions; filepath="")` reads vcov results and loads them into a `GMMvcov` object.
 
 # Package To-do list
 

diff --git a/examples/example_ols_2step.jl b/examples/example_ols_2step.jl
@@ -55,13 +55,9 @@ end
     # estimate model
     myfit = GMMTools.fit(df, ols_moments_fn, theta0, mode=:twostep, opts=myopts)
 
-    myfit.theta_hat
-    myfit.fits_df
-    # ols_moments_fn(df, [0.06364138660614915, 0.2604202723600453])
-sdfd
-
-    myopts.path *= "step1/"
-    myfit2 = GMMTools.read_fit(myopts)
+    # read fit from file
+    mypath = "C:/git-repos/GMMTools.jl/examples/temp/step2/"
+    myfit2 = GMMTools.read_fit(mypath)
 
 ### using Optim.jl
     # estimation options
@@ -78,11 +74,16 @@ sdfd
     # estimate model
     myfit = GMMTools.fit(df, ols_moments_fn, theta0, mode=:twostep, opts=myopts);
 
+
 # compute asymptotic variance-covariance matrix and save in myfit.vcov
-    vcov_simple(df, ols_moments_fn, myfit)
+    vcov_simple(df, ols_moments_fn, myfit, opts=myopts)
 
+    GMMTools.write(myfit.vcov, myopts.path)
 
-    GMMTools.write(myfit.vcov, myopts)
+    # read vcov from file
+    mypath = "C:/git-repos/GMMTools.jl/examples/temp/"
+    myvcov2 = GMMTools.read_vcov(mypath)
+    myfit.vcov = myvcov2
 
 # print table with results
     regtable(myfit) |> display
@@ -102,7 +103,7 @@ sdfd
 
 
 # read vcov with bootstrap from file
-    myfit.vcov = GMMTools.read_vcov(myopts)
+    myfit.vcov = GMMTools.read_vcov(myopts.path);
     regtable(myfit) |> display
 
     # using Plots
@@ -113,5 +114,4 @@ sdfd
     # vcov_bboot(df, ols_moments_fn, theta0, myfit, boot_weights=:cluster, cluster_var=:cylinders, nboot=500, opts=myopts)
     # myfit.vcov
 
-    # GMMTools.regtable(myfit)
-
+    # GMMTools.regtable(myfit)
diff --git a/examples/example_regtable.jl b/examples/example_regtable.jl
@@ -0,0 +1,41 @@
+using Pkg
+Pkg.activate(".")
+Pkg.resolve()
+Pkg.instantiate()
+
+using Revise
+using CSV
+using DataFrames
+using FixedEffectModels # for benchmarking
+using RegressionTables
+
+using GMMTools
+
+# load data, originally from: https://www.kaggle.com/datasets/uciml/autompg-dataset/?select=auto-mpg.csv 
+    df = CSV.read("examples/auto-mpg.csv", DataFrame)
+    df[!, :constant] .= 1.0
+
+
+# Run plain OLS for comparison
+    reg_ols = reg(df, term(:mpg) ~ term(:acceleration))
+    regtable(reg_ols)
+
+    # read fit from file
+    mypath = "C:/git-repos/GMMTools.jl/examples/temp/step2/"
+    myfit = GMMTools.read_fit(mypath)
+
+    # read vcov from file
+    mypath = "C:/git-repos/GMMTools.jl/examples/temp/"
+    myfit.vcov = GMMTools.read_vcov(mypath)
+
+    myfit.theta_names = ["(Intercept)", "acceleration"]
+
+# print table with results (combine both OLS and GMM results)
+    # want to use a new label for the estimator: doesn't work
+    # RegressionTables.label_distribution(render::AbstractRenderType, d::GMMTools.GMMFit) = "GMM"
+    # default: print_estimator_section = false
+
+    # mylabels = Dict("theta_1" => "(Intercept)", "theta_2" => "acceleration")
+    regtable(reg_ols, myfit, render = AsciiTable(), below_statistic = ConfInt) |> display
+
+
diff --git a/src/functions_estimation.jl b/src/functions_estimation.jl
@@ -4,8 +4,8 @@ Base.@kwdef mutable struct GMMOptions
     path::String = ""                       # path to save results
     write_iter::Bool = false    # write to file each result (each initial run)
     clean_iter::Bool = false    # delete individual run files at the end of the estimation
-    overwrite::Bool = false     # overwrite existing results file and individual run files
-    throw_errors::Bool = false  # throw optimization errors (if false, save them to file but continue with the other runs)
+    overwrite::Bool = true      # overwrite existing results file and individual run files
+    throw_errors::Bool = true   # throw optimization errors (if false, save them to file but continue with the other runs)
 
     optimizer::Symbol = :optim              # optimizer backend: :optim or :lsqfit (LM)
     optim_algo = LBFGS()                    # Optim.jl algorithm
@@ -14,7 +14,6 @@ Base.@kwdef mutable struct GMMOptions
     theta_lower = nothing                   # nothing or vector of lower bounds
     theta_upper = nothing                   # nothing or vector of upper bounds
 
-
     theta_names::Union{Vector{String}, Nothing} = nothing  # names of parameters 
 
     trace::Integer = 0
@@ -54,7 +53,7 @@ Base.@kwdef mutable struct GMMFit
     time_it_took::Union{Float64, Missing}
 
     # results from multiple initial conditions (DataFrame)
-    fits_df = nothing # TODO: switch to PrettyTables.jl and OrderedDict
+    fits_df = nothing
     idx = nothing # aware of which iteration number this is
 
     # variance covariance matrix
@@ -251,7 +250,7 @@ function fit_twostep(
         fit_step2 = deepcopy(fit_step1)
         fit_step2.mode = :twostep2
         # save results to file?
-        (opts.path != "") && write(fit_step2, opts)
+        (opts.path != "") && write(fit_step2, opts.path)
 
         return fit_step2
     end
@@ -358,7 +357,7 @@ function fit_onestep(
     stats_at_theta_hat(best_model_fit, data, mom_fn)
 
     # save results to file?
-    (opts.path != "") && write(best_model_fit, opts)
+    (opts.path != "") && write(best_model_fit, opts.path)
 
     # delete all intermediate files with individual iteration results
     opts.clean_iter && clean_iter(opts)
@@ -408,7 +407,7 @@ function fit_onerun(
     # write intermediate results to file
     if opts.write_iter 
         (opts.trace > 0) && println(" Done and done writing to file.")
-        write(model_fit, opts, subpath="__iter__/results_" * string(idx)) # this does not contain moms_hat (good, saves space)
+        write(model_fit, opts.path, subpath="__iter__/results_" * string(idx)) # this does not contain moms_hat (good, saves space)
     else
         (opts.trace > 0) && println(" Done. ")
     end

diff --git a/src/functions_inference.jl b/src/functions_inference.jl
@@ -44,11 +44,24 @@ function jacobian(data, mom_fn::Function, myfit::GMMFit)
 
 end
 
-function vcov_simple(data, mom_fn::Function, myfit::GMMFit)
+function vcov_simple(
+    data, 
+    mom_fn::Function, 
+    myfit::GMMFit; 
+    opts::GMMOptions=default_gmm_opts()
+    )
 
     # cannot compute if estimation errored
     myfit.errored && error("Cannot compute vcov_simple because estimation errored.") 
 
+    # try to read from file
+    myvcov = read_fit(opts.path)
+    if !isnothing(myvcov) && myvcov.method == :simple
+        # TODO: use opts.overwrite option to error if vcov already exists but is not :simple
+        myfit.vcov = myvcov
+        return
+    end
+
     # jacobian
     J = jacobian(data, mom_fn, myfit)
 
@@ -75,6 +88,9 @@ function vcov_simple(data, mom_fn::Function, myfit::GMMFit)
         Σ = Σ,
         ses = sqrt.(diag(V)))
 
+    # save results to file?
+    (opts.path != "") && write(myfit.vcov, opts.path)
+
     return
 end
 
@@ -259,6 +275,14 @@ function vcov_bboot(
     run_parallel=true,
     opts::GMMOptions=default_gmm_opts())
 
+    # try to read from file
+    myvcov = read_fit(opts.path)
+    if !isnothing(myvcov) && myvcov.method == :bayesian_bootstrap
+        # TODO: use opts.overwrite option to error if vcov already exists but is not bayesian_bootstrap
+        myfit.vcov = myvcov
+        return
+    end
+
     # copy options so we can modify them (trace and path)
     opts = deepcopy(opts)
 
@@ -324,7 +348,7 @@ function vcov_bboot(
     )
 
     # save results to file?
-    (opts.path != "") && write(myfit.vcov, opts)
+    (opts.path != "") && write(myfit.vcov, opts.path)
 
     # delete all intermediate files with individual iteration results
     if opts.clean_iter 
@@ -337,7 +361,7 @@ function vcov_bboot(
         end
     end
 
-    return boot_fits
+    return 
 end
 
 function bboot(

diff --git a/src/functions_regtable.jl b/src/functions_regtable.jl
@@ -48,8 +48,8 @@ has_fe(m::GMMRegModel) = false
 # RegressionTables.replace_name(x::Tuple{Vararg{Any}}, a::Dict{String, String}, b::Dict{String, String}) = [RegressionTables.replace_name(x[i], a, b) for i=1:length(x)]
 # RegressionTables.formula(m::GMMModel) = term(m.responsename) ~ sum(term.(String.(m.coefnames)))
 
-RegressionTables._responsename(x::GMMRegModel) = string(responsename(x))
-RegressionTables._coefnames(x::GMMRegModel) = string.(coefnames(x))
+RegressionTables._responsename(x::GMMRegModel) = RegressionTables.CoefName(string(responsename(x)))
+RegressionTables._coefnames(x::GMMRegModel) = RegressionTables.CoefName.(string.(coefnames(x)))
 
 StatsAPI.coef(m::GMMRegModel) = m.coef
 StatsAPI.coefnames(m::GMMRegModel) = m.coefnames
@@ -109,7 +109,7 @@ function GMMRegModel(r::GMMFit)
         fe=DataFrame(),
         fekeys=[],
         coefnames=r.theta_names,
-        responsename="a",
+        responsename="", # no column header
         # formula::FormulaTerm        # Original formula
         # formula_schema::FormulaTerm # Schema for predict
         contrasts=Dict(),
@@ -133,7 +133,7 @@ function RegressionTables.regtable(rrs::Vararg{Union{RegressionModel, GMMFit}};
 
     rrs_converted = [isa(r, GMMFit) ? GMMRegModel(r) : r for r=rrs]
 
-    return RegressionTables.regtable(rrs_converted..., kwargs...)
+    return RegressionTables.regtable(rrs_converted...; kwargs...)
 end
 
 

diff --git a/src/io.jl b/src/io.jl
@@ -45,13 +45,12 @@ end
 Write GMMFit object to files: JSON for most fields + CSV for fits_df table
 All paths should exist.
 """
-function write(myfit::GMMFit, opts::GMMOptions; subpath="fit")
+function write(myfit::GMMFit, full_path; subpath="fit")
 
     # paths
-    if opts.path == "" 
+    if full_path == "" 
         return
-    end
-    full_path = opts.path
+    end    
     (full_path[end] == '/') && (full_path *= '/') # ? platform issues?
     full_path *= subpath
 
@@ -87,12 +86,11 @@ end
 
 ### Variance-covariance results
 
-function write(myvcov::GMMvcov, opts::GMMOptions; subpath="vcov")
+function write(myvcov::GMMvcov, full_path; subpath="vcov")
 
-    if opts.path == ""
+    if full_path == ""
         return
     end
-    full_path = opts.path
     (full_path[end] == '/') && (full_path *= '/') # ? platform issues?
     full_path *= subpath
 
@@ -196,11 +194,10 @@ function dict2fit(myfit_dict)
 end
 
 """
-filepath should not include the extension (.csv or .json)
+Example: fit object saved under "C:/temp/fit.json" and "C:/temp/fit.csv". Then call `read_fit("C:/temp/")`.
 """
-function read_fit(opts::GMMOptions; subpath="fit", show_trace=false)
+function read_fit(full_path; subpath="fit", show_trace=false)
 
-    full_path = opts.path
     (full_path[end] == '/') && (full_path *= '/') # ? platform issues?
     full_path *= subpath
 
@@ -225,13 +222,19 @@ function read_fit(opts::GMMOptions; subpath="fit", show_trace=false)
         myfit.moms_hat = readdlm(full_path * "_moms_hat.csv", ',', Float64)
     end
 
+    @info "Read fit from file from " * full_path 
+
+    # try to automatically read vcov object (myfit.vcov = nothing if this fails)
+    myfit.vcov = read_vcov(full_path, show_trace=show_trace)
+
     return myfit
 end
 
-
-function read_vcov(opts::GMMOptions; subpath="vcov", show_trace=false)
+"""
+Example: vcov object saved under "C:/temp/vcov.json". Then call `read_vcov("C:/temp/")`.
+"""
+function read_vcov(full_path; subpath="vcov", show_trace=false)
 
-    full_path = opts.path
     (full_path[end] == '/') && (full_path *= '/') # ? platform issues?
     full_path *= subpath
 
@@ -286,5 +289,7 @@ function read_vcov(opts::GMMOptions; subpath="vcov", show_trace=false)
             boot_moms_hat_df = boot_moms_hat_df)
     end
 
+    @info "Read vcov type [" * string(myvcov.method) * "] from file from " * full_path 
+
     return myvcov
 end