Merge remote-tracking branch 'upstream/master' into update_calib

PSLmodels · Aug 27, 2024 · d024d5c · d024d5c
2 parents 9411acf + 42a4ff8
commit d024d5c
Show file tree

Hide file tree

Showing 12 changed files with 70 additions and 86 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,11 +5,17 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.12] - 2024-08-26 12:00:00
+
+### Added
+
+- Streamlined the `run_og_usa.py` script to make the example clearer, run faster, and save output in a common directory.
+
 ## [0.1.11] - 2024-07-26 12:00:00
 
 ### Added
 
-- Adds a module to update Tax-Calculator growth factors using OG-USA simualtions.
+- Adds a module to update Tax-Calculator growth factors using OG-USA simulations.
 
 
 ## [0.1.10] - 2024-06-10 12:00:00
@@ -124,7 +130,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Any earlier versions of OG-USA can be found in the [`OG-Core`](https://github.com/PSLmodels/OG-Core) repository [release history](https://github.com/PSLmodels/OG-Core/releases) from [v.0.6.4](https://github.com/PSLmodels/OG-Core/releases/tag/v0.6.4) (Jul. 20, 2021) or earlier.
 
 
-
+[0.1.12]: https://github.com/PSLmodels/OG-USA/compare/v0.1.11...v0.1.12
+[0.1.11]: https://github.com/PSLmodels/OG-USA/compare/v0.1.10...v0.1.11
 [0.1.10]: https://github.com/PSLmodels/OG-USA/compare/v0.1.9...v0.1.10
 [0.1.9]: https://github.com/PSLmodels/OG-USA/compare/v0.1.8...v0.1.9
 [0.1.8]: https://github.com/PSLmodels/OG-USA/compare/v0.1.7...v0.1.8

diff --git a/environment.yml b/environment.yml
@@ -13,7 +13,8 @@ dependencies:
 - dask>=2.30.0
 - dask-core>=2.30.0
 - distributed>=2.30.1
-- paramtools>=0.15.0
+- "marshmallow<3.22"    # to work around paramtools bug
+- "paramtools>=0.18.2"  # requires marshmallow>=3.0
 - taxcalc>=3.0.0
 - sphinx>=3.5.4
 - sphinx-book-theme>=0.1.3

diff --git a/examples/run_og_usa.py b/examples/run_og_usa.py
@@ -3,6 +3,8 @@
 import os
 import json
 import time
+import importlib.resources
+import copy
 from taxcalc import Calculator
 import matplotlib.pyplot as plt
 from ogusa.calibrate import Calibration
@@ -28,8 +30,9 @@ def main():
 
     # Directories to save data
     CUR_DIR = os.path.dirname(os.path.realpath(__file__))
-    base_dir = os.path.join(CUR_DIR, "OG-USA-Example", "OUTPUT_BASELINE")
-    reform_dir = os.path.join(CUR_DIR, "OG-USA-Example", "OUTPUT_REFORM")
+    save_dir = os.path.join(CUR_DIR, "OG-USA-Example")
+    base_dir = os.path.join(save_dir, "OUTPUT_BASELINE")
+    reform_dir = os.path.join(save_dir, "OUTPUT_REFORM")
 
     """
     ------------------------------------------------------------------------
@@ -44,22 +47,13 @@ def main():
         output_base=base_dir,
     )
     # Update parameters for baseline from default json file
-    p.update_specifications(
-        json.load(
-            open(
-                os.path.join(
-                    CUR_DIR, "..", "ogusa", "ogusa_default_parameters.json"
-                )
-            )
-        )
-    )
-    p.tax_func_type = "GS"
-    p.age_specific = False
+    with importlib.resources.open_text(
+        "ogusa", "ogusa_default_parameters.json"
+    ) as file:
+        defaults = json.load(file)
+    p.update_specifications(defaults)
+    p.tax_func_type = "HSV"
     c = Calibration(p, estimate_tax_functions=True, client=client)
-    # close and delete client bc cache is too large
-    client.close()
-    del client
-    client = Client(n_workers=num_workers, threads_per_worker=1)
     d = c.get_dict()
     # # additional parameters to change
     updated_params = {
@@ -88,43 +82,23 @@ def main():
     # In this example the 'reform' is a change to 2017 law (the
     # baseline policy is tax law in 2018)
     reform_url = (
-        "github://PSLmodels:examples@main/psl_examples/"
-        + "taxcalc/2017_law.json"
+        "github://PSLmodels:Tax-Calculator@master/taxcalc/"
+        + "reforms/2017_law.json"
     )
+
     ref = Calculator.read_json_param_objects(reform_url, None)
     iit_reform = ref["policy"]
 
     # create new Specifications object for reform simulation
-    p2 = Specifications(
-        baseline=False,
-        num_workers=num_workers,
-        baseline_dir=base_dir,
-        output_base=reform_dir,
-    )
-    # Update parameters for baseline from default json file
-    p2.update_specifications(
-        json.load(
-            open(
-                os.path.join(
-                    CUR_DIR, "..", "ogusa", "ogusa_default_parameters.json"
-                )
-            )
-        )
-    )
-    p2.tax_func_type = "GS"
-    p2.age_specific = False
+    p2 = copy.deepcopy(p)
     # Use calibration class to estimate reform tax functions from
     # Tax-Calculator, specifying reform for Tax-Calculator in iit_reform
     c2 = Calibration(
         p2, iit_reform=iit_reform, estimate_tax_functions=True, client=client
     )
-    # close and delete client bc cache is too large
-    client.close()
-    del client
-    client = Client(n_workers=num_workers, threads_per_worker=1)
     # update tax function parameters in Specifications Object
     d = c2.get_dict()
-    # # additional parameters to change
+    # additional parameters to change
     updated_params = {
         "cit_rate": [[0.35]],
         "etr_params": d["etr_params"],
@@ -168,7 +142,7 @@ def main():
     op.plot_all(
         base_dir,
         reform_dir,
-        os.path.join(CUR_DIR, "OG-USA_example_plots_tables"),
+        os.path.join(save_dir, "OG-USA_example_plots_tables"),
     )
     # Create CSV file with output
     ot.tp_output_dump_table(
@@ -178,7 +152,7 @@ def main():
         reform_tpi,
         table_format="csv",
         path=os.path.join(
-            CUR_DIR,
+            save_dir,
             "OG-USA_example_plots_tables",
             "macro_time_series_output.csv",
         ),
@@ -188,7 +162,7 @@ def main():
     # save percentage change output to csv file
     ans.to_csv(
         os.path.join(
-            CUR_DIR, "OG-USA_example_plots_tables", "ogusa_example_output.csv"
+            save_dir, "OG-USA_example_plots_tables", "ogusa_example_output.csv"
         )
     )
 

diff --git a/ogusa/__init__.py b/ogusa/__init__.py
@@ -11,4 +11,4 @@
 from ogusa.utils import *
 from ogusa.wealth import *
 
-__version__ = "0.1.11"
+__version__ = "0.1.12"
diff --git a/ogusa/calibrate.py b/ogusa/calibrate.py
@@ -6,7 +6,6 @@
 from taxcalc import Records
 from ogcore import txfunc, demographics
 from ogcore.utils import safe_read_pickle, mkdirs
-import pkg_resources
 
 
 class Calibration:

diff --git a/ogusa/get_micro_data.py b/ogusa/get_micro_data.py
@@ -12,7 +12,7 @@
 import numpy as np
 import os
 import pickle
-import pkg_resources
+import importlib.metadata
 from ogcore import utils
 from ogusa.constants import DEFAULT_START_YEAR, TC_LAST_YEAR
 
@@ -183,7 +183,7 @@ def get_data(
     del results
 
     # Pull Tax-Calc version for reference
-    taxcalc_version = pkg_resources.get_distribution("taxcalc").version
+    taxcalc_version = importlib.metadata.version("taxcalc")
 
     return micro_data_dict, taxcalc_version
 
@@ -263,7 +263,8 @@ def taxcalc_advance(
         "total_tax_liab": calc1.array("combined"),
         "payroll_tax_liab": calc1.array("payrolltax"),
         "etr": (
-            (calc1.array("combined") - calc1.array("ubi")) / market_income
+            (calc1.array("combined") - calc1.array("ubi"))
+            / np.maximum(market_income, 1)
         ),
         "year": calc1.current_year * np.ones(length),
         "weight": calc1.array("s006"),

diff --git a/ogusa/macro_params.py b/ogusa/macro_params.py
@@ -106,7 +106,9 @@ def get_macro_params():
 
     # find g_y
     macro_parameters["g_y"] = (
-        fred_data_q["GDP Per Capita"].pct_change(periods=4, freq="QE").mean()
+        fred_data_q["GDP Per Capita"]
+        .pct_change(periods=4, freq="QE", fill_method=None)
+        .mean()
     )
 
     # # estimate r_gov_shift and r_gov_scale

diff --git a/ogusa/psid_data_setup.py b/ogusa/psid_data_setup.py
@@ -12,7 +12,7 @@
     # This is the case when a separate script is calling this function in
     # this module
     CURDIR = os.path.split(os.path.abspath(__file__))[0]
-except:
+except NameError:
     # This is the case when a Jupyter notebook is calling this function
     CURDIR = os.getcwd()
 output_fldr = "io_files"
@@ -54,11 +54,13 @@ def prep_data(
     # SRC sample families have 1968 family interview numbers less than 3000
     raw_df = raw_df[raw_df["ID1968"] < 3000].copy()
 
-    raw_df["relation.head"][
-        (raw_df["year"] < 1983) & (raw_df["relation.head"] == 1)
+    raw_df.loc[
+        raw_df.index[(raw_df["year"] < 1983) & (raw_df["relation.head"] == 1)],
+        "relation.head",
     ] = 10
-    raw_df["relation.head"][
-        (raw_df["year"] < 1983) & (raw_df["relation.head"] == 2)
+    raw_df.loc[
+        raw_df.index[(raw_df["year"] < 1983) & (raw_df["relation.head"] == 2)],
+        "relation.head",
     ] = 20
     head_df = raw_df.loc[
         raw_df.index[
@@ -123,7 +125,7 @@ def prep_data(
     # pull series of interest using pandas_datareader
     fred_data = web.DataReader(["CPIAUCSL"], "fred", start, end)
     # Make data annual by averaging over months in year
-    fred_data = fred_data.resample("A").mean()
+    fred_data = fred_data.resample("YE").mean()
     fred_data["year_data"] = fred_data.index.year
     psid_df2 = psid_df.merge(fred_data, how="left", on="year_data")
     psid_df = psid_df2
@@ -275,15 +277,11 @@ def prep_data(
     # Backfill and then forward fill variables that are constant over time
     # within hhid
     for item in PSID_CONSTANT_VARS:
-        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].fillna(
-            method="bfill"
-        )
-        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].fillna(
-            method="ffill"
-        )
+        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].bfill()
+        rebalanced_data[item] = rebalanced_data.groupby("hh_id")[item].ffill()
 
     ### NOTE: we seem to get some cases where the marital status is not constant
-    # despite trying to set up the indentifcation of a household such that it
+    # despite trying to set up the identification of a household such that it
     # has to be.  Why this is happening needs to be checked.
 
     # Fill in year by doing a cumulative counter within each hh_id and then

diff --git a/ogusa/utils.py b/ogusa/utils.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-from scipy.stats import kde
+from scipy.stats import gaussian_kde
 import matplotlib.pyplot as plt
 import requests
 import urllib3
@@ -28,6 +28,7 @@ def read_cbo_forecast():
             & (pd.isnull(df["Unnamed: 2"]))
         )
     ]
+    # df.fillna(value=np.nan, inplace=True)
     df.fillna(value="", inplace=True)
     df["full_var_name"] = (
         df["Unnamed: 0"] + df["Unnamed: 1"] + df["Unnamed: 2"]
@@ -203,7 +204,7 @@ def MVKDE(
         k += 1
 
     freq_mat = np.vstack((age_frequency, income_frequency)).T
-    density = kde.gaussian_kde(freq_mat.T, bw_method=bandwidth)
+    density = gaussian_kde(freq_mat.T, bw_method=bandwidth)
     age_min, income_min = freq_mat.min(axis=0)
     age_max, income_max = freq_mat.max(axis=0)
     agei, incomei = np.mgrid[

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="ogusa",
-    version="0.1.11",
+    version="0.1.12",
     author="Jason DeBacker and Richard W. Evans",
     license="CC0 1.0 Universal (CC0 1.0) Public Domain Dedication",
     description="USA calibration for OG-Core",

diff --git a/tests/test_calibrate.py b/tests/test_calibrate.py
@@ -30,6 +30,8 @@ def test_read_tax_func_estimate_error():
 def test_read_tax_func_estimate():
     p = ogcore.Specifications()
     p.BW = 11
+    p.tax_func_type = "DEP"
+    p.start_year = 2021
     tax_func_path = os.path.join(
         CUR_PATH, "test_io_data", "TxFuncEst_policy.pkl"
     )

diff --git a/tests/test_get_micro_data.py b/tests/test_get_micro_data.py
@@ -219,8 +219,6 @@ def test_get_calculator_puf_from_file():
 def test_get_data(baseline, dask_client):
     """
     Test of get_micro_data.get_data() function
-
-    Note that this test may fail if the Tax-Calculator is not v 3.2.2
     """
     expected_data = utils.safe_read_pickle(
         os.path.join(CUR_PATH, "test_io_data", "micro_data_dict_for_tests.pkl")
@@ -238,44 +236,45 @@ def test_get_data(baseline, dask_client):
     test_data2 = {x: test_data[x] for x in keys}
     for k, v in test_data2.items():
         try:
-            assert_frame_equal(expected_data[k], v)
+            # check that columns are the same
+            assert set(expected_data[k].columns) == set(v.columns)
+            # check that test data contains at least one non-null value
+            assert v.count().sum() > 0
         except KeyError:
             pass
 
 
 def test_taxcalc_advance():
     """
     Test of the get_micro_data.taxcalc_advance() function
-
-    Note that this test may fail if the Tax-Calculator is not v 3.2.1
     """
     expected_dict = utils.safe_read_pickle(
         os.path.join(CUR_PATH, "test_io_data", "tax_dict_for_tests.pkl")
     )
     test_dict = get_micro_data.taxcalc_advance(
         2028, {}, {}, "cps", None, None, 2014, 2028
     )
-    for k, v in test_dict.items():
-        assert np.allclose(expected_dict[k], v, equal_nan=True)
+    # check that keys are the same
+    assert set(expected_dict.keys()) == set(test_dict.keys())
+    for _, v in test_dict.items():
+        # check that test data returns some non-zero values
+        assert np.count_nonzero(v) > 0
 
 
 @pytest.mark.local
 def test_cap_inc_mtr():
     """
     Test of the get_micro_data.cap_inc_mtr() function
-
-    Note that this test may fail if the Tax-Calculator is not v 3.2.1
     """
     calc1 = get_micro_data.get_calculator(
         calculator_start_year=2028, iit_reform={}, data="cps"
     )
     calc1.advance_to_year(2028)
-    expected = np.genfromtxt(
-        os.path.join(
-            CUR_PATH, "test_io_data", "mtr_combined_capinc_for_tests.csv"
-        ),
-        delimiter=",",
-    )
     test_data = get_micro_data.cap_inc_mtr(calc1)
 
-    assert np.allclose(expected, test_data, equal_nan=True)
+    # check that test data returns some non-zero values
+    assert np.count_nonzero(test_data) > 0
+    # assert mtrs < 1
+    assert test_data.max() < 1
+    # assert mtrs > -1
+    assert test_data.min() > -1