Skip to content

Commit

Permalink
two step GMM and testing
Browse files Browse the repository at this point in the history
  • Loading branch information
Gkreindler committed Dec 21, 2023
1 parent 8a927e0 commit c51986d
Show file tree
Hide file tree
Showing 7 changed files with 220 additions and 33 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ version = "0.1.0"
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
Expand Down
102 changes: 102 additions & 0 deletions examples/example_ols_2step.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
using Pkg
Pkg.activate(".")
Pkg.resolve()
Pkg.instantiate()

using Revise
using LinearAlgebra # for identity matrix "I"
using CSV
using DataFrames
using FixedEffectModels # for benchmarking
using RegressionTables

using GMMTools
using Optim # need for NewtonTrustRegion()


# load data, originally from: https://www.kaggle.com/datasets/uciml/autompg-dataset/?select=auto-mpg.csv
df = CSV.read("examples/auto-mpg.csv", DataFrame)
df[!, :constant] .= 1.0


# Run plain OLS for comparison
r = reg(df, term(:mpg) ~ term(:acceleration))
RegressionTables.regtable(r)

# define moments for OLS regression
# residuals orthogonal to the constant and to the variable (acceleration)
# this must always return a Matrix (rows = observations, columns = moments)
function ols_moments(prob, theta)

resids = @. prob.data.mpg - theta[1] - theta[2] * prob.data.acceleration

# n by 2 matrix of moments
moms = hcat(resids, resids .* prob.data.acceleration)

return moms
end

# initial parameter guess
# theta0 = [0.0, 0.0]
theta0 = randn(20,2)

# create GMM problem (data + weighting matrix + initial parameter guess)
myprob = create_GMMProblem(
data=df,
W=I,
theta0=theta0)

# estimation options
myopts = GMMTools.GMMOptions(
path="C:/git-repos/GMMTools.jl/examples/temp/",
optim_algo=NewtonTrustRegion(),
optim_autodiff=:forward,
write_iter=true,
clean_iter=true,
overwrite=true,
trace=1)

# estimate model
myfit = fit(myprob, ols_moments, mode=:twostep, opts=myopts)



# compute asymptotic variance-covariance matrix and save in myfit.vcov
vcov_simple(myprob, ols_moments, myfit)

# print table with results
GMMTools.regtable(myfit)








# compute Bayesian (weighted) bootstrap inference and save in myfit.vcov
vcov_bboot(myprob, ols_moments, myfit, nboot=500)
GMMTools.regtable(myfit) # print table with new bootstrap SEs

# bootstrap with weightes drawn at the level of clusters defined by the variable df.cylinders
vcov_bboot(myprob, ols_moments, myfit, cluster_var=:cylinders, nboot=500)
GMMTools.regtable(myfit)

### Multiple initial conditions and save to file
theta0 = [0.3, 0.5]

theta0_matrix = random_theta0(theta0, 100) # generate 100 random initial conditions "around" theta0

myprob = create_GMMProblem(data=df, W=I, theta0=theta0_matrix)

myopts = default_gmm_opts(
path = "C:/git-repos/GMMTools.jl/examples/temp/", # replace with target folder on your machine
write_iter=true, # save results to file option
clean_iter=true, # save results to file option
overwrite=false, # save results to file option
trace=1
)

myfit = fit(myprob, ols_moments, opts=myopts)
vcov_simple(myprob, ols_moments, myfit)
GMMTools.regtable(myfit)
2 changes: 2 additions & 0 deletions examples/temp/step1/Wstep2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
0.7160381402040416,-0.043487516296842874
-0.043487516296842874,0.0027167729488171157
21 changes: 21 additions & 0 deletions examples/temp/step1/results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
idx,obj_value,converged,iterations,iteration_limit_reached,time_it_took,theta_hat,theta0,is_optimum
1,6.32223644798054e-30,true,3,false,0.0004746,"[4.969793004254019, 1.191204529350221]","[-0.26926835124845394, 0.37954113694079494]",0
2,4.271350173573995e-30,true,3,false,0.0003078,"[4.969793004253976, 1.1912045293502236]","[0.34556455271206793, 0.25004124299025193]",0
3,6.015658220073783e-30,true,3,false,0.0002686,"[4.969793004254005, 1.1912045293502216]","[0.4086531076118486, 0.1122055213118399]",0
4,5.169561555227248e-32,true,3,false,0.0002935,"[4.969793004253904, 1.1912045293502278]","[2.258597939824897, 0.549774319846248]",1
5,5.4866839709717774e-30,true,3,false,0.0003357,"[4.9697930042539955, 1.1912045293502223]","[0.6682975725920416, -0.3046381451639788]",0
6,2.858903272859906e-30,true,3,false,0.0003102,"[4.969793004253965, 1.1912045293502243]","[0.026238700144936913, 1.02011621705373]",0
7,9.729989142153385e-30,true,3,false,0.0003966,"[4.969793004254015, 1.1912045293502214]","[-0.8084154014789137, 0.3737181172901421]",0
8,1.2853701000725376e-30,true,3,false,0.0005991,"[4.9697930042539475, 1.1912045293502251]","[1.6553857540303305, 0.9772035784295662]",0
9,3.1270864177685063e-30,true,3,false,0.000602,"[4.969793004253978, 1.1912045293502236]","[-0.4358533131420818, 0.8416885155869507]",0
10,2.858903272859906e-30,true,3,false,0.0008496,"[4.969793004253966, 1.1912045293502243]","[-0.6746815710608132, 0.5703807098956672]",0
11,4.887749809084658e-30,true,3,false,0.0005192,"[4.969793004253987, 1.1912045293502231]","[-0.637661275087828, 0.3134098736025502]",0
12,7.679713427709319e-30,true,3,false,0.0003796,"[4.969793004254006, 1.1912045293502216]","[-0.45656749766775084, 0.6859254153377403]",0
13,1.7022635656121709e-31,true,3,false,0.0003592,"[4.969793004253919, 1.191204529350227]","[0.3690356139154637, 1.1767982541932573]",0
14,5.641065468573316e-30,true,3,false,0.0003084,"[4.969793004253992, 1.1912045293502227]","[0.1644377136788993, -1.0132404450445502]",0
15,5.549664479706198e-31,true,3,false,0.0003051,"[4.969793004253944, 1.1912045293502254]","[1.2594059388919372, 0.5668443814183669]",0
16,3.243040147200819e-30,true,3,false,0.0003208,"[4.969793004253968, 1.191204529350224]","[0.9085651769028054, 0.7723646262391767]",0
17,1.139048765381014e-30,true,3,false,0.0004015,"[4.969793004253947, 1.1912045293502254]","[1.651285961528619, -1.119582694672335]",0
18,5.549664479706198e-31,true,3,false,0.0002906,"[4.969793004253945, 1.1912045293502254]","[1.0247173742143418, -0.012301729081761994]",0
19,7.367255564915692e-31,true,3,false,0.0003143,"[4.969793004253938, 1.1912045293502258]","[0.37133497370694063, 1.5754594070993344]",0
20,8.60142516371406e-30,true,3,false,0.0002686,"[4.9697930042539955, 1.1912045293502225]","[0.23769577896671254, 0.11408936635142151]",0
21 changes: 21 additions & 0 deletions examples/temp/step2/results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
idx,obj_value,converged,iterations,iteration_limit_reached,time_it_took,theta_hat,theta0,is_optimum
1,5.641065468573316e-30,true,3,false,0.0005577,"[4.969793004253992, 1.1912045293502227]","[-0.26926835124845394, 0.37954113694079494]",0
2,1.0723735340833267e-30,true,3,false,0.0003425,"[4.9697930042539245, 1.191204529350227]","[0.34556455271206793, 0.25004124299025193]",0
3,1.6938870425260983e-30,true,3,false,0.0003095,"[4.969793004253935, 1.1912045293502262]","[0.4086531076118486, 0.1122055213118399]",0
4,1.6332434610828883e-30,true,3,false,0.0003961,"[4.969793004253906, 1.1912045293502276]","[2.258597939824897, 0.549774319846248]",0
5,2.7977999857061576e-30,true,3,false,0.0003969,"[4.969793004253962, 1.1912045293502245]","[0.6682975725920416, -0.3046381451639788]",0
6,7.159915972130363e-31,true,3,false,0.0003682,"[4.9697930042539396, 1.1912045293502258]","[0.026238700144936913, 1.02011621705373]",0
7,2.4809926934350844e-30,true,3,false,0.0003491,"[4.969793004253951, 1.1912045293502254]","[-0.8084154014789137, 0.3737181172901421]",0
8,2.047452152849753e-31,true,3,false,0.00047,"[4.96979300425392, 1.1912045293502267]","[1.6553857540303305, 0.9772035784295662]",1
9,1.793812265532666e-30,true,3,false,0.0003686,"[4.96979300425396, 1.1912045293502247]","[-0.4358533131420818, 0.8416885155869507]",0
10,2.5430461941547175e-30,true,3,false,0.0004063,"[4.969793004253958, 1.191204529350225]","[-0.6746815710608132, 0.5703807098956672]",0
11,3.1270864177685133e-30,true,3,false,0.0002817,"[4.969793004253978, 1.1912045293502236]","[-0.637661275087828, 0.3134098736025502]",0
12,2.4809926934350844e-30,true,3,false,0.0002497,"[4.969793004253951, 1.1912045293502254]","[-0.45656749766775084, 0.6859254153377403]",0
13,7.367255564915703e-31,true,3,false,0.0002964,"[4.969793004253938, 1.1912045293502258]","[0.3690356139154637, 1.1767982541932573]",0
14,3.278693875234676e-30,true,3,false,0.0002763,"[4.969793004253954, 1.1912045293502251]","[0.1644377136788993, -1.0132404450445502]",0
15,7.367255564915703e-31,true,3,false,0.0002619,"[4.969793004253936, 1.1912045293502258]","[1.2594059388919372, 0.5668443814183669]",0
16,8.641601728373638e-31,true,3,false,0.0003081,"[4.969793004253933, 1.1912045293502262]","[0.9085651769028054, 0.7723646262391767]",0
17,1.1976327868854149e-30,true,3,false,0.0002527,"[4.969793004253946, 1.1912045293502256]","[1.651285961528619, -1.119582694672335]",0
18,2.824755159717008e-30,true,3,false,0.0029855,"[4.969793004253955, 1.191204529350225]","[1.0247173742143418, -0.012301729081761994]",0
19,8.641601728373638e-31,true,3,false,0.0003302,"[4.969793004253933, 1.1912045293502262]","[0.37133497370694063, 1.5754594070993344]",0
20,2.095024388009941e-30,true,3,false,0.0003107,"[4.969793004253937, 1.1912045293502262]","[0.23769577896671254, 0.11408936635142151]",0
3 changes: 2 additions & 1 deletion src/GMMTools.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ using StatsAPI

# for (Bayesian) bootstrap
using StatsBase # take samples
using Distributions # Dirichlet distribution
using Distributions # Dirichlet distribution for bootstrap

using DelimitedFiles
using CSV
using JSON

Expand Down
103 changes: 71 additions & 32 deletions src/functions_estimation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ end

Base.@kwdef mutable struct GMMOptions
path::String = ""
autodiff::Symbol = :none
optim_autodiff::Symbol = :none
optim_algo = LBFGS()
optim_opts = nothing
write_iter::Bool = false # write to file each result (each initial run)
clean_iter::Bool = false #
Expand All @@ -169,15 +170,17 @@ end
function default_gmm_opts(;
path = "",
optim_opts=default_optim_opts(),
autodiff=:none,
optim_autodiff=:none,
optim_algo=LBFGS(),
write_iter = false, # write to file each result (each initial run)
clean_iter = false, #
overwrite = false,
trace = 0)

return GMMOptions(
path=path,
autodiff=autodiff,
optim_autodiff=optim_autodiff,
optim_algo=optim_algo,
optim_opts=optim_opts,
write_iter=write_iter,
clean_iter=clean_iter,
Expand All @@ -188,34 +191,36 @@ end
# extent the "copy" method to the GMMOptions type
Base.copy(x::GMMOptions) = GMMOptions([getfield(x, k) for k fieldnames(GMMOptions)]...)

function write(est_result::GMMResult, opts::GMMOptions; subpath="", filename="")
function write(est_result::GMMResult, opts::GMMOptions, filename; subpath="")

if opts.path == ""
return
end

if subpath == ""
full_path = opts.path
else
(subpath[end] != "/") && (subpath *= "/")
(subpath[end] != '/') && (subpath *= "/")
full_path = opts.path * subpath
isdir(full_path) || mkdir(full_path)
end

(filename == "") ? filename = "results.csv" : nothing

file_path = full_path * filename

CSV.write(file_path, table(est_result))
CSV.write(full_path * filename, table(est_result))
end


function parse_vector(s::String)
function parse_vector(s::AbstractString)
return parse.(Float64, split(s[2:(end-1)],","))
end

function load(prob::GMMProblem, opts::GMMOptions, filepath::String)
function load(prob::GMMProblem, opts::GMMOptions; filepath="")

if filepath == ""
full_path = opts.path * "results.csv"
else
full_path = opts.path * filepath
end

full_path = opts.path * filepath
if isfile(full_path)
df = CSV.read(full_path, DataFrame)

Expand Down Expand Up @@ -255,6 +260,16 @@ function load(prob::GMMProblem, opts::GMMOptions, filepath::String)
end
end

function clean_iter(opts)
try
print("Deleting intermediate files from: ", opts.path)
rm(opts.path * "__iter__/", force=true, recursive=true)
println(" Done.")
catch e
println(" Error while deleting intermediate files from : ", opts.path, ". Error: ", e)
end
end

function add_nobs(myfit::GMMResult, problem::GMMProblem, mom_fn::Function)
try
myfit.N = size(problem.data, 1)
Expand Down Expand Up @@ -321,32 +336,46 @@ function fit_twostep(
run_parallel=true,
opts=default_gmm_opts())

# Step 1
main_path = opts.path
(main_path[end] != '/') && (main_path *= "/")

### Step 1
(opts.trace > 0) && println(">>> Starting GMM step 1.")
opts.path = main_path * "step1/"
isdir(opts.path) || mkdir(opts.path)

fit_step1 = fit_onestep(
problem,
mom_fn,
run_parallel=run_parallel,
opts=opts)

# optimal weight matrix
### optimal weight matrix
m = mom_fn(problem, fit_step1.theta_hat)
nmomsize = size(m, 1)
println("number of observations: ", nmomsize)
# (opts.trace > 0) && println("number of observations: ", nmomsize)

Wstep2 = Hermitian(transpose(m) * m / nmomsize)
Wstep2 = inv(Wstep2)
problem.W = Wstep2

display(Wstep2)
# Save Wstep2 to file
writedlm( opts.path * "Wstep2.csv", Wstep2, ',')

# TODO: save Wstep2 to file
### Step 2
(opts.trace > 0) && println(">>> Starting GMM step 2.")
opts.path = main_path * "step2/"
isdir(opts.path) || mkdir(opts.path)

fit_step2 = fit_onestep(
problem,
mom_fn,
run_parallel=run_parallel,
opts=opts)

# revert
opts.path = main_path

return fit_step2
end

Expand All @@ -363,11 +392,15 @@ function fit_onestep(
# skip if output file already exists
if !opts.overwrite && (opts.path != "")

opt_results_from_file = load(problem, opts, "results.csv")
opt_results_from_file = load(problem, opts)

if !isnothing(opt_results_from_file)
println(" Results already exist. Reading from file.")
println(" Results file already exists. Reading from file.")
add_nobs(opt_results_from_file, problem, mom_fn)

# delete all intermediate files with individual iteration results
opts.clean_iter && clean_iter(opts)

return opt_results_from_file
end
end
Expand All @@ -388,14 +421,10 @@ function fit_onestep(
add_nobs(best_result, problem, mom_fn)

# save results to file?
(opts.path != "") && write(best_result, opts, filename="results.csv")
(opts.path != "") && write(best_result, opts, "results.csv")

# delete all intermediate files with individual iteration results
if opts.clean_iter
print("Deleting intermediate files...")
rm(opts.path * "__iter__/", force=true, recursive=true)
println(" Done.")
end
opts.clean_iter && clean_iter(opts)

return best_result
end
Expand All @@ -412,7 +441,7 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM
# skip if output file already exists
if !opts.overwrite && (opts.path != "")

opt_results_from_file = load(problem, opts, "__iter__/results_" * string(idx) * ".csv")
opt_results_from_file = load(problem, opts, filepath="__iter__/results_" * string(idx) * ".csv")

if !isnothing(opt_results_from_file)
(opts.trace > 0) && println(" Reading from file.")
Expand All @@ -427,11 +456,21 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM
gmm_objective_loaded = theta -> gmm_objective(theta, problem, mom_fn, trace=opts.trace)

# optimize
if opts.autodiff == :forward
println("using AD")
time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, theta0, LBFGS(), opts.optim_opts, autodiff=:forward)
if opts.optim_autodiff == :forward
(opts.trace > 0) && print("using AD")
time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded,
theta0,
opts.optim_algo, # defalut = LBFGS()
opts.optim_opts,
autodiff=:forward)

# results = optimize(f, g!, lower, upper, initial_x, Fminbox(GradientDescent()), Optim.Options(outer_iterations = 2))

else
time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded, theta0, opts.optim_opts)
time_it_took = @elapsed raw_opt_results = Optim.optimize(gmm_objective_loaded,
theta0,
opts.optim_algo, # defalut = LBFGS()
opts.optim_opts)
end
#=
summary(res)
Expand Down Expand Up @@ -461,7 +500,7 @@ function fit_onerun(idx::Int64, problem::GMMProblem, mom_fn::Function, opts::GMM
# write intermediate results to file
if opts.write_iter
(opts.trace > 0) && println(" Done and done writing to file.")
write(opt_results, opts, subpath="__iter__", filename="results_" * string(idx) * ".csv")
write(opt_results, opts, "results_" * string(idx) * ".csv", subpath="__iter__")
else
(opts.trace > 0) && println(" Done. ")
end
Expand Down

0 comments on commit c51986d

Please sign in to comment.