two step GMM and testing

Gkreindler · Dec 21, 2023 · c51986d · c51986d
1 parent 8a927e0
commit c51986d
Show file tree

Hide file tree

Showing 7 changed files with 220 additions and 33 deletions.
diff --git a/Project.toml b/Project.toml
@@ -6,6 +6,7 @@ version = "0.1.0"
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"

diff --git a/examples/example_ols_2step.jl b/examples/example_ols_2step.jl
@@ -0,0 +1,102 @@
+using Pkg
+Pkg.activate(".")
+Pkg.resolve()
+Pkg.instantiate()
+
+using Revise
+using LinearAlgebra # for identity matrix "I"
+using CSV
+using DataFrames
+using FixedEffectModels # for benchmarking
+using RegressionTables
+
+using GMMTools
+using Optim # need for NewtonTrustRegion()
+
+
+# load data, originally from: https://www.kaggle.com/datasets/uciml/autompg-dataset/?select=auto-mpg.csv 
+    df = CSV.read("examples/auto-mpg.csv", DataFrame)
+    df[!, :constant] .= 1.0
+
+
+# Run plain OLS for comparison
+    r = reg(df, term(:mpg) ~ term(:acceleration))
+    RegressionTables.regtable(r)
+
+# define moments for OLS regression
+# residuals orthogonal to the constant and to the variable (acceleration)
+# this must always return a Matrix (rows = observations, columns = moments)
+function ols_moments(prob, theta)
+
+    resids = @. prob.data.mpg - theta[1] - theta[2] * prob.data.acceleration
+
+    # n by 2 matrix of moments
+    moms = hcat(resids, resids .* prob.data.acceleration)
+
+    return moms
+end
+
+# initial parameter guess
+    # theta0 = [0.0, 0.0]
+    theta0 = randn(20,2)
+
+# create GMM problem (data + weighting matrix + initial parameter guess)
+    myprob = create_GMMProblem(
+                data=df, 
+                W=I,    
+                theta0=theta0)
+
+# estimation options
+myopts = GMMTools.GMMOptions(
+                path="C:/git-repos/GMMTools.jl/examples/temp/", 
+                optim_algo=NewtonTrustRegion(), 
+                optim_autodiff=:forward,
+                write_iter=true,
+                clean_iter=true,
+                overwrite=true,
+                trace=1)
+
+# estimate model
+myfit = fit(myprob, ols_moments, mode=:twostep, opts=myopts)
+
+
+
+# compute asymptotic variance-covariance matrix and save in myfit.vcov
+    vcov_simple(myprob, ols_moments, myfit)
+
+# print table with results
+    GMMTools.regtable(myfit)
+
+
+
+
+
+
+
+
+# compute Bayesian (weighted) bootstrap inference and save in myfit.vcov
+    vcov_bboot(myprob, ols_moments, myfit, nboot=500)
+    GMMTools.regtable(myfit) # print table with new bootstrap SEs
+
+# bootstrap with weightes drawn at the level of clusters defined by the variable df.cylinders
+    vcov_bboot(myprob, ols_moments, myfit, cluster_var=:cylinders, nboot=500)
+    GMMTools.regtable(myfit)
+
+### Multiple initial conditions and save to file
+    theta0 = [0.3, 0.5]
+
+    theta0_matrix = random_theta0(theta0, 100) # generate 100 random initial conditions "around" theta0
+
+    myprob = create_GMMProblem(data=df, W=I, theta0=theta0_matrix)
+
+    myopts = default_gmm_opts(
+        path = "C:/git-repos/GMMTools.jl/examples/temp/", # replace with target folder on your machine
+        write_iter=true, # save results to file option
+        clean_iter=true, # save results to file option
+        overwrite=false,  # save results to file option
+        trace=1
+        )
+
+    myfit = fit(myprob, ols_moments, opts=myopts)
+    vcov_simple(myprob, ols_moments, myfit)
+    GMMTools.regtable(myfit)
diff --git a/examples/temp/step1/Wstep2.csv b/examples/temp/step1/Wstep2.csv
@@ -0,0 +1,2 @@
+0.7160381402040416,-0.043487516296842874
+-0.043487516296842874,0.0027167729488171157
diff --git a/examples/temp/step1/results.csv b/examples/temp/step1/results.csv
@@ -0,0 +1,21 @@
+idx,obj_value,converged,iterations,iteration_limit_reached,time_it_took,theta_hat,theta0,is_optimum
+1,6.32223644798054e-30,true,3,false,0.0004746,"[4.969793004254019, 1.191204529350221]","[-0.26926835124845394, 0.37954113694079494]",0
+2,4.271350173573995e-30,true,3,false,0.0003078,"[4.969793004253976, 1.1912045293502236]","[0.34556455271206793, 0.25004124299025193]",0
+3,6.015658220073783e-30,true,3,false,0.0002686,"[4.969793004254005, 1.1912045293502216]","[0.4086531076118486, 0.1122055213118399]",0
+4,5.169561555227248e-32,true,3,false,0.0002935,"[4.969793004253904, 1.1912045293502278]","[2.258597939824897, 0.549774319846248]",1
+5,5.4866839709717774e-30,true,3,false,0.0003357,"[4.9697930042539955, 1.1912045293502223]","[0.6682975725920416, -0.3046381451639788]",0
+6,2.858903272859906e-30,true,3,false,0.0003102,"[4.969793004253965, 1.1912045293502243]","[0.026238700144936913, 1.02011621705373]",0
+7,9.729989142153385e-30,true,3,false,0.0003966,"[4.969793004254015, 1.1912045293502214]","[-0.8084154014789137, 0.3737181172901421]",0
+8,1.2853701000725376e-30,true,3,false,0.0005991,"[4.9697930042539475, 1.1912045293502251]","[1.6553857540303305, 0.9772035784295662]",0
+9,3.1270864177685063e-30,true,3,false,0.000602,"[4.969793004253978, 1.1912045293502236]","[-0.4358533131420818, 0.8416885155869507]",0
+10,2.858903272859906e-30,true,3,false,0.0008496,"[4.969793004253966, 1.1912045293502243]","[-0.6746815710608132, 0.5703807098956672]",0
+11,4.887749809084658e-30,true,3,false,0.0005192,"[4.969793004253987, 1.1912045293502231]","[-0.637661275087828, 0.3134098736025502]",0
+12,7.679713427709319e-30,true,3,false,0.0003796,"[4.969793004254006, 1.1912045293502216]","[-0.45656749766775084, 0.6859254153377403]",0
+13,1.7022635656121709e-31,true,3,false,0.0003592,"[4.969793004253919, 1.191204529350227]","[0.3690356139154637, 1.1767982541932573]",0
+14,5.641065468573316e-30,true,3,false,0.0003084,"[4.969793004253992, 1.1912045293502227]","[0.1644377136788993, -1.0132404450445502]",0
+15,5.549664479706198e-31,true,3,false,0.0003051,"[4.969793004253944, 1.1912045293502254]","[1.2594059388919372, 0.5668443814183669]",0
+16,3.243040147200819e-30,true,3,false,0.0003208,"[4.969793004253968, 1.191204529350224]","[0.9085651769028054, 0.7723646262391767]",0
+17,1.139048765381014e-30,true,3,false,0.0004015,"[4.969793004253947, 1.1912045293502254]","[1.651285961528619, -1.119582694672335]",0
+18,5.549664479706198e-31,true,3,false,0.0002906,"[4.969793004253945, 1.1912045293502254]","[1.0247173742143418, -0.012301729081761994]",0
+19,7.367255564915692e-31,true,3,false,0.0003143,"[4.969793004253938, 1.1912045293502258]","[0.37133497370694063, 1.5754594070993344]",0
+20,8.60142516371406e-30,true,3,false,0.0002686,"[4.9697930042539955, 1.1912045293502225]","[0.23769577896671254, 0.11408936635142151]",0
diff --git a/examples/temp/step2/results.csv b/examples/temp/step2/results.csv
@@ -0,0 +1,21 @@
+idx,obj_value,converged,iterations,iteration_limit_reached,time_it_took,theta_hat,theta0,is_optimum
+1,5.641065468573316e-30,true,3,false,0.0005577,"[4.969793004253992, 1.1912045293502227]","[-0.26926835124845394, 0.37954113694079494]",0
+2,1.0723735340833267e-30,true,3,false,0.0003425,"[4.9697930042539245, 1.191204529350227]","[0.34556455271206793, 0.25004124299025193]",0
+3,1.6938870425260983e-30,true,3,false,0.0003095,"[4.969793004253935, 1.1912045293502262]","[0.4086531076118486, 0.1122055213118399]",0
+4,1.6332434610828883e-30,true,3,false,0.0003961,"[4.969793004253906, 1.1912045293502276]","[2.258597939824897, 0.549774319846248]",0
+5,2.7977999857061576e-30,true,3,false,0.0003969,"[4.969793004253962, 1.1912045293502245]","[0.6682975725920416, -0.3046381451639788]",0
+6,7.159915972130363e-31,true,3,false,0.0003682,"[4.9697930042539396, 1.1912045293502258]","[0.026238700144936913, 1.02011621705373]",0
+7,2.4809926934350844e-30,true,3,false,0.0003491,"[4.969793004253951, 1.1912045293502254]","[-0.8084154014789137, 0.3737181172901421]",0
+8,2.047452152849753e-31,true,3,false,0.00047,"[4.96979300425392, 1.1912045293502267]","[1.6553857540303305, 0.9772035784295662]",1
+9,1.793812265532666e-30,true,3,false,0.0003686,"[4.96979300425396, 1.1912045293502247]","[-0.4358533131420818, 0.8416885155869507]",0
+10,2.5430461941547175e-30,true,3,false,0.0004063,"[4.969793004253958, 1.191204529350225]","[-0.6746815710608132, 0.5703807098956672]",0
+11,3.1270864177685133e-30,true,3,false,0.0002817,"[4.969793004253978, 1.1912045293502236]","[-0.637661275087828, 0.3134098736025502]",0
+12,2.4809926934350844e-30,true,3,false,0.0002497,"[4.969793004253951, 1.1912045293502254]","[-0.45656749766775084, 0.6859254153377403]",0
+13,7.367255564915703e-31,true,3,false,0.0002964,"[4.969793004253938, 1.1912045293502258]","[0.3690356139154637, 1.1767982541932573]",0
+14,3.278693875234676e-30,true,3,false,0.0002763,"[4.969793004253954, 1.1912045293502251]","[0.1644377136788993, -1.0132404450445502]",0
+15,7.367255564915703e-31,true,3,false,0.0002619,"[4.969793004253936, 1.1912045293502258]","[1.2594059388919372, 0.5668443814183669]",0
+16,8.641601728373638e-31,true,3,false,0.0003081,"[4.969793004253933, 1.1912045293502262]","[0.9085651769028054, 0.7723646262391767]",0
+17,1.1976327868854149e-30,true,3,false,0.0002527,"[4.969793004253946, 1.1912045293502256]","[1.651285961528619, -1.119582694672335]",0
+18,2.824755159717008e-30,true,3,false,0.0029855,"[4.969793004253955, 1.191204529350225]","[1.0247173742143418, -0.012301729081761994]",0
+19,8.641601728373638e-31,true,3,false,0.0003302,"[4.969793004253933, 1.1912045293502262]","[0.37133497370694063, 1.5754594070993344]",0
+20,2.095024388009941e-30,true,3,false,0.0003107,"[4.969793004253937, 1.1912045293502262]","[0.23769577896671254, 0.11408936635142151]",0
diff --git a/src/GMMTools.jl b/src/GMMTools.jl
@@ -12,8 +12,9 @@ using StatsAPI
 
 # for (Bayesian) bootstrap
 using StatsBase # take samples
-using Distributions # Dirichlet distribution
+using Distributions # Dirichlet distribution for bootstrap
 
+using DelimitedFiles
 using CSV 
 using JSON
 

diff --git a/src/functions_estimation.jl b/src/functions_estimation.jl
@@ -151,7 +151,8 @@ end
 
 Base.@kwdef mutable struct GMMOptions
     path::String = ""
-    autodiff::Symbol = :none
+    optim_autodiff::Symbol = :none
+    optim_algo = LBFGS()
     optim_opts = nothing
     write_iter::Bool = false    # write to file each result (each initial run)
     clean_iter::Bool = false    # 
@@ -169,15 +170,17 @@ end
 function default_gmm_opts(;
     path = "",
     optim_opts=default_optim_opts(),
-    autodiff=:none,
+    optim_autodiff=:none,
+    optim_algo=LBFGS(),
     write_iter = false,    # write to file each result (each initial run)
     clean_iter = false,    # 
     overwrite = false,
     trace = 0)
 
     return GMMOptions(
         path=path,
-        autodiff=autodiff,
+        optim_autodiff=optim_autodiff,
+        optim_algo=optim_algo,
         optim_opts=optim_opts,
         write_iter=write_iter,
         clean_iter=clean_iter,
@@ -188,34 +191,36 @@ end
 # extent the "copy" method to the GMMOptions type
 Base.copy(x::GMMOptions) = GMMOptions([getfield(x, k) for k ∈ fieldnames(GMMOptions)]...)
 
-function write(est_result::GMMResult, opts::GMMOptions; subpath="", filename="")
+function write(est_result::GMMResult, opts::GMMOptions, filename; subpath="")
+
     if opts.path == ""
         return
     end
 
     if subpath == ""
         full_path = opts.path
     else
-        (subpath[end] != "/") && (subpath *= "/")
+        (subpath[end] != '/') && (subpath *= "/")
         full_path = opts.path * subpath
         isdir(full_path) || mkdir(full_path)
     end
 
-    (filename == "") ? filename = "results.csv" : nothing
-
-    file_path = full_path * filename
-
-    CSV.write(file_path, table(est_result))
+    CSV.write(full_path * filename, table(est_result))
 end
 
 
-function parse_vector(s::String)
+function parse_vector(s::AbstractString)
     return parse.(Float64, split(s[2:(end-1)],","))
 end
 
-function load(prob::GMMProblem, opts::GMMOptions, filepath::String)
+function load(prob::GMMProblem, opts::GMMOptions; filepath="")
+
+    if filepath == ""
+        full_path = opts.path * "results.csv"
+    else
+        full_path = opts.path * filepath
+    end
 
-    full_path = opts.path * filepath
     if isfile(full_path)
         df = CSV.read(full_path, DataFrame)
 
@@ -255,6 +260,16 @@ function load(prob::GMMProblem, opts::GMMOptions, filepath::String)
     end
 end
 
+function clean_iter(opts)
+    try
+        print("Deleting intermediate files from: ", opts.path)
+        rm(opts.path * "__iter__/", force=true, recursive=true)
+        println(" Done.")
+    catch e
+        println(" Error while deleting intermediate files from : ", opts.path, ". Error: ", e)
+    end
+end
+
 function add_nobs(myfit::GMMResult, problem::GMMProblem, mom_fn::Function)
     try
         myfit.N = size(problem.data, 1)
@@ -321,32 +336,46 @@ function fit_twostep(
     run_parallel=true, 
     opts=default_gmm_opts())
 
-    # Step 1
+    main_path = opts.path
+    (main_path[end] != '/') && (main_path *= "/")
+
+    ### Step 1
+    (opts.trace > 0) && println(">>> Starting GMM step 1.")
+    opts.path = main_path * "step1/"
+    isdir(opts.path) || mkdir(opts.path)
+
     fit_step1 = fit_onestep(
         problem,
         mom_fn,
         run_parallel=run_parallel,
         opts=opts)
 
-    # optimal weight matrix
+    ### optimal weight matrix
     m = mom_fn(problem, fit_step1.theta_hat)
     nmomsize = size(m, 1)
-    println("number of observations: ", nmomsize)
+    # (opts.trace > 0) && println("number of observations: ", nmomsize)
 
     Wstep2 = Hermitian(transpose(m) * m / nmomsize)
     Wstep2 = inv(Wstep2)
     problem.W = Wstep2
 
-    display(Wstep2)
+    # Save Wstep2 to file
+    writedlm( opts.path * "Wstep2.csv",  Wstep2, ',')
 
-    # TODO: save Wstep2 to file
+    ### Step 2
+    (opts.trace > 0) && println(">>> Starting GMM step 2.")
+    opts.path = main_path * "step2/"
+    isdir(opts.path) || mkdir(opts.path)
 
     fit_step2 = fit_onestep(
         problem,
         mom_fn,
         run_parallel=run_parallel,
         opts=opts)
 
+    # revert
+    opts.path = main_path
+
     return fit_step2
 end
 
@@ -363,11 +392,15 @@ function fit_onestep(
     # skip if output file already exists
     if !opts.overwrite && (opts.path != "")
 
-        opt_results_from_file = load(problem, opts, "results.csv")
+        opt_results_from_file = load(problem, opts)
 
         if !isnothing(opt_results_from_file)
-            println(" Results already exist. Reading from file.")
+            println(" Results file already exists. Reading from file.")
             add_nobs(opt_results_from_file, problem, mom_fn)
+
+            # delete all intermediate files with individual iteration results
+            opts.clean_iter && clean_iter(opts)
+
             return opt_results_from_file
         end
     end
@@ -388,14 +421,10 @@ function fit_onestep(
     add_nobs(best_result, problem, mom_fn)
 
     # save results to file?
-    (opts.path != "") && write(best_result, opts, filename="results.csv")
+    (opts.path != "") && write(best_result, opts, "results.csv")
 
     # delete all intermediate files with individual iteration results
-    if opts.clean_iter 
-        print("Deleting intermediate files...")
-        rm(opts.path * "__iter__/", force=true, recursive=true)
-        println(" Done.")
-    end
+    opts.clean_iter && clean_iter(opts)
 
     return best_result
 end
@@ -412,7 +441,7 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM
     # skip if output file already exists
     if !opts.overwrite && (opts.path != "")
 
-        opt_results_from_file = load(problem, opts, "__iter__/results_" * string(idx) * ".csv")
+        opt_results_from_file = load(problem, opts, filepath="__iter__/results_" * string(idx) * ".csv")
 
         if !isnothing(opt_results_from_file)
             (opts.trace > 0) && println(" Reading from file.")
@@ -427,11 +456,21 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM
     gmm_objective_loaded = theta -> gmm_objective(theta, problem, mom_fn, trace=opts.trace)
 
     # optimize
-    if opts.autodiff == :forward
-        println("using AD")
-        time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, theta0, LBFGS(), opts.optim_opts, autodiff=:forward)
+    if opts.optim_autodiff == :forward
+        (opts.trace > 0) && print("using AD")
+        time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, 
+                                                                theta0, 
+                                                                opts.optim_algo, # defalut = LBFGS()
+                                                                opts.optim_opts, 
+                                                                autodiff=:forward)
+
+        # results = optimize(f, g!, lower, upper, initial_x, Fminbox(GradientDescent()), Optim.Options(outer_iterations = 2))
+
     else
-        time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, theta0, opts.optim_opts)
+        time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, 
+                                                                theta0, 
+                                                                opts.optim_algo, # defalut = LBFGS()
+                                                                opts.optim_opts)
     end
     #= 
     summary(res)
@@ -461,7 +500,7 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM
     # write intermediate results to file
     if opts.write_iter 
         (opts.trace > 0) && println(" Done and done writing to file.")
-        write(opt_results, opts, subpath="__iter__", filename="results_" * string(idx) * ".csv")
+        write(opt_results, opts, "results_" * string(idx) * ".csv", subpath="__iter__")
     else
         (opts.trace > 0) && println(" Done. ")
     end
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		0.7160381402040416,-0.043487516296842874
		-0.043487516296842874,0.0027167729488171157