From c51986d2385d658215985780a63d1b4c18446db8 Mon Sep 17 00:00:00 2001 From: Gabriel E Kreindler Date: Thu, 21 Dec 2023 13:52:04 -0500 Subject: [PATCH] two step GMM and testing --- Project.toml | 1 + examples/example_ols_2step.jl | 102 +++++++++++++++++++++++++++++++ examples/temp/step1/Wstep2.csv | 2 + examples/temp/step1/results.csv | 21 +++++++ examples/temp/step2/results.csv | 21 +++++++ src/GMMTools.jl | 3 +- src/functions_estimation.jl | 103 ++++++++++++++++++++++---------- 7 files changed, 220 insertions(+), 33 deletions(-) create mode 100644 examples/example_ols_2step.jl create mode 100644 examples/temp/step1/Wstep2.csv create mode 100644 examples/temp/step1/results.csv create mode 100644 examples/temp/step2/results.csv diff --git a/Project.toml b/Project.toml index a63fafb..9742925 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.1.0" [deps] CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" diff --git a/examples/example_ols_2step.jl b/examples/example_ols_2step.jl new file mode 100644 index 0000000..9eadc8d --- /dev/null +++ b/examples/example_ols_2step.jl @@ -0,0 +1,102 @@ +using Pkg +Pkg.activate(".") +Pkg.resolve() +Pkg.instantiate() + +using Revise +using LinearAlgebra # for identity matrix "I" +using CSV +using DataFrames +using FixedEffectModels # for benchmarking +using RegressionTables + +using GMMTools +using Optim # need for NewtonTrustRegion() + + +# load data, originally from: https://www.kaggle.com/datasets/uciml/autompg-dataset/?select=auto-mpg.csv + df = CSV.read("examples/auto-mpg.csv", DataFrame) + df[!, :constant] .= 1.0 + + +# Run plain OLS for comparison + r = reg(df, term(:mpg) ~ term(:acceleration)) + RegressionTables.regtable(r) + +# define moments for OLS regression +# residuals orthogonal to the constant and to the variable (acceleration) +# this must always return a Matrix (rows = observations, columns = moments) +function ols_moments(prob, theta) + + resids = @. prob.data.mpg - theta[1] - theta[2] * prob.data.acceleration + + # n by 2 matrix of moments + moms = hcat(resids, resids .* prob.data.acceleration) + + return moms +end + +# initial parameter guess + # theta0 = [0.0, 0.0] + theta0 = randn(20,2) + +# create GMM problem (data + weighting matrix + initial parameter guess) + myprob = create_GMMProblem( + data=df, + W=I, + theta0=theta0) + +# estimation options +myopts = GMMTools.GMMOptions( + path="C:/git-repos/GMMTools.jl/examples/temp/", + optim_algo=NewtonTrustRegion(), + optim_autodiff=:forward, + write_iter=true, + clean_iter=true, + overwrite=true, + trace=1) + +# estimate model +myfit = fit(myprob, ols_moments, mode=:twostep, opts=myopts) + + + +# compute asymptotic variance-covariance matrix and save in myfit.vcov + vcov_simple(myprob, ols_moments, myfit) + +# print table with results + GMMTools.regtable(myfit) + + + + + + + + +# compute Bayesian (weighted) bootstrap inference and save in myfit.vcov + vcov_bboot(myprob, ols_moments, myfit, nboot=500) + GMMTools.regtable(myfit) # print table with new bootstrap SEs + +# bootstrap with weightes drawn at the level of clusters defined by the variable df.cylinders + vcov_bboot(myprob, ols_moments, myfit, cluster_var=:cylinders, nboot=500) + GMMTools.regtable(myfit) + +### Multiple initial conditions and save to file + theta0 = [0.3, 0.5] + + theta0_matrix = random_theta0(theta0, 100) # generate 100 random initial conditions "around" theta0 + + myprob = create_GMMProblem(data=df, W=I, theta0=theta0_matrix) + + myopts = default_gmm_opts( + path = "C:/git-repos/GMMTools.jl/examples/temp/", # replace with target folder on your machine + write_iter=true, # save results to file option + clean_iter=true, # save results to file option + overwrite=false, # save results to file option + trace=1 + ) + + myfit = fit(myprob, ols_moments, opts=myopts) + vcov_simple(myprob, ols_moments, myfit) + GMMTools.regtable(myfit) \ No newline at end of file diff --git a/examples/temp/step1/Wstep2.csv b/examples/temp/step1/Wstep2.csv new file mode 100644 index 0000000..f1817dd --- /dev/null +++ b/examples/temp/step1/Wstep2.csv @@ -0,0 +1,2 @@ +0.7160381402040416,-0.043487516296842874 +-0.043487516296842874,0.0027167729488171157 diff --git a/examples/temp/step1/results.csv b/examples/temp/step1/results.csv new file mode 100644 index 0000000..1a1be59 --- /dev/null +++ b/examples/temp/step1/results.csv @@ -0,0 +1,21 @@ +idx,obj_value,converged,iterations,iteration_limit_reached,time_it_took,theta_hat,theta0,is_optimum +1,6.32223644798054e-30,true,3,false,0.0004746,"[4.969793004254019, 1.191204529350221]","[-0.26926835124845394, 0.37954113694079494]",0 +2,4.271350173573995e-30,true,3,false,0.0003078,"[4.969793004253976, 1.1912045293502236]","[0.34556455271206793, 0.25004124299025193]",0 +3,6.015658220073783e-30,true,3,false,0.0002686,"[4.969793004254005, 1.1912045293502216]","[0.4086531076118486, 0.1122055213118399]",0 +4,5.169561555227248e-32,true,3,false,0.0002935,"[4.969793004253904, 1.1912045293502278]","[2.258597939824897, 0.549774319846248]",1 +5,5.4866839709717774e-30,true,3,false,0.0003357,"[4.9697930042539955, 1.1912045293502223]","[0.6682975725920416, -0.3046381451639788]",0 +6,2.858903272859906e-30,true,3,false,0.0003102,"[4.969793004253965, 1.1912045293502243]","[0.026238700144936913, 1.02011621705373]",0 +7,9.729989142153385e-30,true,3,false,0.0003966,"[4.969793004254015, 1.1912045293502214]","[-0.8084154014789137, 0.3737181172901421]",0 +8,1.2853701000725376e-30,true,3,false,0.0005991,"[4.9697930042539475, 1.1912045293502251]","[1.6553857540303305, 0.9772035784295662]",0 +9,3.1270864177685063e-30,true,3,false,0.000602,"[4.969793004253978, 1.1912045293502236]","[-0.4358533131420818, 0.8416885155869507]",0 +10,2.858903272859906e-30,true,3,false,0.0008496,"[4.969793004253966, 1.1912045293502243]","[-0.6746815710608132, 0.5703807098956672]",0 +11,4.887749809084658e-30,true,3,false,0.0005192,"[4.969793004253987, 1.1912045293502231]","[-0.637661275087828, 0.3134098736025502]",0 +12,7.679713427709319e-30,true,3,false,0.0003796,"[4.969793004254006, 1.1912045293502216]","[-0.45656749766775084, 0.6859254153377403]",0 +13,1.7022635656121709e-31,true,3,false,0.0003592,"[4.969793004253919, 1.191204529350227]","[0.3690356139154637, 1.1767982541932573]",0 +14,5.641065468573316e-30,true,3,false,0.0003084,"[4.969793004253992, 1.1912045293502227]","[0.1644377136788993, -1.0132404450445502]",0 +15,5.549664479706198e-31,true,3,false,0.0003051,"[4.969793004253944, 1.1912045293502254]","[1.2594059388919372, 0.5668443814183669]",0 +16,3.243040147200819e-30,true,3,false,0.0003208,"[4.969793004253968, 1.191204529350224]","[0.9085651769028054, 0.7723646262391767]",0 +17,1.139048765381014e-30,true,3,false,0.0004015,"[4.969793004253947, 1.1912045293502254]","[1.651285961528619, -1.119582694672335]",0 +18,5.549664479706198e-31,true,3,false,0.0002906,"[4.969793004253945, 1.1912045293502254]","[1.0247173742143418, -0.012301729081761994]",0 +19,7.367255564915692e-31,true,3,false,0.0003143,"[4.969793004253938, 1.1912045293502258]","[0.37133497370694063, 1.5754594070993344]",0 +20,8.60142516371406e-30,true,3,false,0.0002686,"[4.9697930042539955, 1.1912045293502225]","[0.23769577896671254, 0.11408936635142151]",0 diff --git a/examples/temp/step2/results.csv b/examples/temp/step2/results.csv new file mode 100644 index 0000000..1500677 --- /dev/null +++ b/examples/temp/step2/results.csv @@ -0,0 +1,21 @@ +idx,obj_value,converged,iterations,iteration_limit_reached,time_it_took,theta_hat,theta0,is_optimum +1,5.641065468573316e-30,true,3,false,0.0005577,"[4.969793004253992, 1.1912045293502227]","[-0.26926835124845394, 0.37954113694079494]",0 +2,1.0723735340833267e-30,true,3,false,0.0003425,"[4.9697930042539245, 1.191204529350227]","[0.34556455271206793, 0.25004124299025193]",0 +3,1.6938870425260983e-30,true,3,false,0.0003095,"[4.969793004253935, 1.1912045293502262]","[0.4086531076118486, 0.1122055213118399]",0 +4,1.6332434610828883e-30,true,3,false,0.0003961,"[4.969793004253906, 1.1912045293502276]","[2.258597939824897, 0.549774319846248]",0 +5,2.7977999857061576e-30,true,3,false,0.0003969,"[4.969793004253962, 1.1912045293502245]","[0.6682975725920416, -0.3046381451639788]",0 +6,7.159915972130363e-31,true,3,false,0.0003682,"[4.9697930042539396, 1.1912045293502258]","[0.026238700144936913, 1.02011621705373]",0 +7,2.4809926934350844e-30,true,3,false,0.0003491,"[4.969793004253951, 1.1912045293502254]","[-0.8084154014789137, 0.3737181172901421]",0 +8,2.047452152849753e-31,true,3,false,0.00047,"[4.96979300425392, 1.1912045293502267]","[1.6553857540303305, 0.9772035784295662]",1 +9,1.793812265532666e-30,true,3,false,0.0003686,"[4.96979300425396, 1.1912045293502247]","[-0.4358533131420818, 0.8416885155869507]",0 +10,2.5430461941547175e-30,true,3,false,0.0004063,"[4.969793004253958, 1.191204529350225]","[-0.6746815710608132, 0.5703807098956672]",0 +11,3.1270864177685133e-30,true,3,false,0.0002817,"[4.969793004253978, 1.1912045293502236]","[-0.637661275087828, 0.3134098736025502]",0 +12,2.4809926934350844e-30,true,3,false,0.0002497,"[4.969793004253951, 1.1912045293502254]","[-0.45656749766775084, 0.6859254153377403]",0 +13,7.367255564915703e-31,true,3,false,0.0002964,"[4.969793004253938, 1.1912045293502258]","[0.3690356139154637, 1.1767982541932573]",0 +14,3.278693875234676e-30,true,3,false,0.0002763,"[4.969793004253954, 1.1912045293502251]","[0.1644377136788993, -1.0132404450445502]",0 +15,7.367255564915703e-31,true,3,false,0.0002619,"[4.969793004253936, 1.1912045293502258]","[1.2594059388919372, 0.5668443814183669]",0 +16,8.641601728373638e-31,true,3,false,0.0003081,"[4.969793004253933, 1.1912045293502262]","[0.9085651769028054, 0.7723646262391767]",0 +17,1.1976327868854149e-30,true,3,false,0.0002527,"[4.969793004253946, 1.1912045293502256]","[1.651285961528619, -1.119582694672335]",0 +18,2.824755159717008e-30,true,3,false,0.0029855,"[4.969793004253955, 1.191204529350225]","[1.0247173742143418, -0.012301729081761994]",0 +19,8.641601728373638e-31,true,3,false,0.0003302,"[4.969793004253933, 1.1912045293502262]","[0.37133497370694063, 1.5754594070993344]",0 +20,2.095024388009941e-30,true,3,false,0.0003107,"[4.969793004253937, 1.1912045293502262]","[0.23769577896671254, 0.11408936635142151]",0 diff --git a/src/GMMTools.jl b/src/GMMTools.jl index 967df42..d348200 100644 --- a/src/GMMTools.jl +++ b/src/GMMTools.jl @@ -12,8 +12,9 @@ using StatsAPI # for (Bayesian) bootstrap using StatsBase # take samples -using Distributions # Dirichlet distribution +using Distributions # Dirichlet distribution for bootstrap +using DelimitedFiles using CSV using JSON diff --git a/src/functions_estimation.jl b/src/functions_estimation.jl index 68e538a..e99a489 100644 --- a/src/functions_estimation.jl +++ b/src/functions_estimation.jl @@ -151,7 +151,8 @@ end Base.@kwdef mutable struct GMMOptions path::String = "" - autodiff::Symbol = :none + optim_autodiff::Symbol = :none + optim_algo = LBFGS() optim_opts = nothing write_iter::Bool = false # write to file each result (each initial run) clean_iter::Bool = false # @@ -169,7 +170,8 @@ end function default_gmm_opts(; path = "", optim_opts=default_optim_opts(), - autodiff=:none, + optim_autodiff=:none, + optim_algo=LBFGS(), write_iter = false, # write to file each result (each initial run) clean_iter = false, # overwrite = false, @@ -177,7 +179,8 @@ function default_gmm_opts(; return GMMOptions( path=path, - autodiff=autodiff, + optim_autodiff=optim_autodiff, + optim_algo=optim_algo, optim_opts=optim_opts, write_iter=write_iter, clean_iter=clean_iter, @@ -188,7 +191,8 @@ end # extent the "copy" method to the GMMOptions type Base.copy(x::GMMOptions) = GMMOptions([getfield(x, k) for k ∈ fieldnames(GMMOptions)]...) -function write(est_result::GMMResult, opts::GMMOptions; subpath="", filename="") +function write(est_result::GMMResult, opts::GMMOptions, filename; subpath="") + if opts.path == "" return end @@ -196,26 +200,27 @@ function write(est_result::GMMResult, opts::GMMOptions; subpath="", filename="") if subpath == "" full_path = opts.path else - (subpath[end] != "/") && (subpath *= "/") + (subpath[end] != '/') && (subpath *= "/") full_path = opts.path * subpath isdir(full_path) || mkdir(full_path) end - (filename == "") ? filename = "results.csv" : nothing - - file_path = full_path * filename - - CSV.write(file_path, table(est_result)) + CSV.write(full_path * filename, table(est_result)) end -function parse_vector(s::String) +function parse_vector(s::AbstractString) return parse.(Float64, split(s[2:(end-1)],",")) end -function load(prob::GMMProblem, opts::GMMOptions, filepath::String) +function load(prob::GMMProblem, opts::GMMOptions; filepath="") + + if filepath == "" + full_path = opts.path * "results.csv" + else + full_path = opts.path * filepath + end - full_path = opts.path * filepath if isfile(full_path) df = CSV.read(full_path, DataFrame) @@ -255,6 +260,16 @@ function load(prob::GMMProblem, opts::GMMOptions, filepath::String) end end +function clean_iter(opts) + try + print("Deleting intermediate files from: ", opts.path) + rm(opts.path * "__iter__/", force=true, recursive=true) + println(" Done.") + catch e + println(" Error while deleting intermediate files from : ", opts.path, ". Error: ", e) + end +end + function add_nobs(myfit::GMMResult, problem::GMMProblem, mom_fn::Function) try myfit.N = size(problem.data, 1) @@ -321,25 +336,36 @@ function fit_twostep( run_parallel=true, opts=default_gmm_opts()) - # Step 1 + main_path = opts.path + (main_path[end] != '/') && (main_path *= "/") + + ### Step 1 + (opts.trace > 0) && println(">>> Starting GMM step 1.") + opts.path = main_path * "step1/" + isdir(opts.path) || mkdir(opts.path) + fit_step1 = fit_onestep( problem, mom_fn, run_parallel=run_parallel, opts=opts) - # optimal weight matrix + ### optimal weight matrix m = mom_fn(problem, fit_step1.theta_hat) nmomsize = size(m, 1) - println("number of observations: ", nmomsize) + # (opts.trace > 0) && println("number of observations: ", nmomsize) Wstep2 = Hermitian(transpose(m) * m / nmomsize) Wstep2 = inv(Wstep2) problem.W = Wstep2 - display(Wstep2) + # Save Wstep2 to file + writedlm( opts.path * "Wstep2.csv", Wstep2, ',') - # TODO: save Wstep2 to file + ### Step 2 + (opts.trace > 0) && println(">>> Starting GMM step 2.") + opts.path = main_path * "step2/" + isdir(opts.path) || mkdir(opts.path) fit_step2 = fit_onestep( problem, @@ -347,6 +373,9 @@ function fit_twostep( run_parallel=run_parallel, opts=opts) + # revert + opts.path = main_path + return fit_step2 end @@ -363,11 +392,15 @@ function fit_onestep( # skip if output file already exists if !opts.overwrite && (opts.path != "") - opt_results_from_file = load(problem, opts, "results.csv") + opt_results_from_file = load(problem, opts) if !isnothing(opt_results_from_file) - println(" Results already exist. Reading from file.") + println(" Results file already exists. Reading from file.") add_nobs(opt_results_from_file, problem, mom_fn) + + # delete all intermediate files with individual iteration results + opts.clean_iter && clean_iter(opts) + return opt_results_from_file end end @@ -388,14 +421,10 @@ function fit_onestep( add_nobs(best_result, problem, mom_fn) # save results to file? - (opts.path != "") && write(best_result, opts, filename="results.csv") + (opts.path != "") && write(best_result, opts, "results.csv") # delete all intermediate files with individual iteration results - if opts.clean_iter - print("Deleting intermediate files...") - rm(opts.path * "__iter__/", force=true, recursive=true) - println(" Done.") - end + opts.clean_iter && clean_iter(opts) return best_result end @@ -412,7 +441,7 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM # skip if output file already exists if !opts.overwrite && (opts.path != "") - opt_results_from_file = load(problem, opts, "__iter__/results_" * string(idx) * ".csv") + opt_results_from_file = load(problem, opts, filepath="__iter__/results_" * string(idx) * ".csv") if !isnothing(opt_results_from_file) (opts.trace > 0) && println(" Reading from file.") @@ -427,11 +456,21 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM gmm_objective_loaded = theta -> gmm_objective(theta, problem, mom_fn, trace=opts.trace) # optimize - if opts.autodiff == :forward - println("using AD") - time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, theta0, LBFGS(), opts.optim_opts, autodiff=:forward) + if opts.optim_autodiff == :forward + (opts.trace > 0) && print("using AD") + time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, + theta0, + opts.optim_algo, # defalut = LBFGS() + opts.optim_opts, + autodiff=:forward) + + # results = optimize(f, g!, lower, upper, initial_x, Fminbox(GradientDescent()), Optim.Options(outer_iterations = 2)) + else - time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, theta0, opts.optim_opts) + time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, + theta0, + opts.optim_algo, # defalut = LBFGS() + opts.optim_opts) end #= summary(res) @@ -461,7 +500,7 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM # write intermediate results to file if opts.write_iter (opts.trace > 0) && println(" Done and done writing to file.") - write(opt_results, opts, subpath="__iter__", filename="results_" * string(idx) * ".csv") + write(opt_results, opts, "results_" * string(idx) * ".csv", subpath="__iter__") else (opts.trace > 0) && println(" Done. ") end