Skip to content

Commit

Permalink
sync w master and resolve conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
jdebacker committed Jun 19, 2024
2 parents 887afb7 + acb734c commit 579c526
Show file tree
Hide file tree
Showing 29 changed files with 574 additions and 109,395 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
os: [ubuntu-latest, macos-latest, windows-latest]
python-version: ["3.10", "3.11"]

steps:
Expand All @@ -52,9 +52,9 @@ jobs:
shell: bash -l {0}
working-directory: ./
run: |
python -m pytest -m "not local" --cov=./ --cov-report=xml
python -m pytest -m "not local and not needs_puf and not needs_tmd" --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-latest'
if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/OG-USA')
uses: codecov/codecov-action@v4
with:
files: ./coverage.xml
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ htmlcov/*
*.asv
*.nav
*.snm
*.gz
*.bib.bak
*.fls
*.m~
Expand All @@ -47,6 +46,7 @@ examples/OG-USA-Example/*
cs-config/cs_config/OUTPUT_BASELINE/*
data/csv_output_files/*
data/images/*
data/PSID/psid_lifetime_income.csv
ogusa/csv_output_files/*
ogusa/images/*
.vscode/
Expand Down
33 changes: 33 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [0.1.10] - 2024-06-10 12:00:00

### Added

- Removes the `rpy2` dependency from the `environment.yml` and `setup.py` files, and modifies use of PSID data to avoid needing this package in OG-USA.


## [0.1.9] - 2024-06-07 12:00:00

### Added

- Updates the `get_micro_data.py` and `calibration.py` modules to allow for the user to use the CPS, PUF, and TMD files with Tax-Calculator or to provide their own custom datafile, with associated grow factors and weights.


## [0.1.8] - 2024-05-20 12:00:00

### Added

- Updates the `ogusa` package to include the zipped `psid_lifetime_income.csv.gz` file, which is now called in some calibration modules (`bequest_transmission.py`, `deterministic_profiles.py`, and `transfer_distirbution.py`), but with an option for the user to provide their own custom datafile. These changes allow for Jupyter notebook users to execute the `Calibration` class object and for those who install the `ogusa` package from PyPI to have the required datafile for the major calibration modules.


## [0.1.7] - 2024-05-14 16:30:00

### Added

- Updates the dependency `rpy2>=3.5.12` in `environment.yml` and `setup.py`.


## [0.1.6] - 2024-05-08 10:30:00

### Added
Expand Down Expand Up @@ -90,6 +119,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[0.1.10]: https://github.com/PSLmodels/OG-USA/compare/v0.1.9...v0.1.10
[0.1.9]: https://github.com/PSLmodels/OG-USA/compare/v0.1.8...v0.1.9
[0.1.8]: https://github.com/PSLmodels/OG-USA/compare/v0.1.7...v0.1.8
[0.1.7]: https://github.com/PSLmodels/OG-USA/compare/v0.1.6...v0.1.7
[0.1.6]: https://github.com/PSLmodels/OG-USA/compare/v0.1.5...v0.1.6
[0.1.5]: https://github.com/PSLmodels/OG-USA/compare/v0.1.4...v0.1.5
[0.1.4]: https://github.com/PSLmodels/OG-USA/compare/v0.1.3...v0.1.4
Expand Down
51 changes: 44 additions & 7 deletions cs-config/cs_config/functions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import ogusa
from ogusa.calibrate import Calibration
from ogcore.parameters import Specifications
from ogusa.constants import (
Expand All @@ -14,18 +15,22 @@
import pickle
import json
import inspect
import pandas as pd
import paramtools
from distributed import Client
from taxcalc import Policy
from taxcalc import Policy, Records, GrowFactors
from collections import OrderedDict
from .helpers import retrieve_puf
from .helpers import retrieve_puf, retrieve_tmd
from cs2tc import convert_policy_adjustment

AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
# S3 location of the TMD file; overridable via the TMD_S3_LOCATION env var.
# NOTE(review): the hard-coded fallback points at the PUF file
# (puf.20210720.csv.gz), apparently copy-pasted from PUF_S3_FILE_LOCATION
# above — confirm the intended TMD default object key.
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
CUR_DIR = os.path.dirname(os.path.realpath(__file__))

# Get Tax-Calculator default parameters
Expand Down Expand Up @@ -78,7 +83,7 @@ class MetaParams(paramtools.Parameters):


def get_version():
return "0.1.2"
return ogusa.__version__


def get_inputs(meta_param_dict):
Expand Down Expand Up @@ -188,16 +193,46 @@ def run_model(meta_param_dict, adjustment):

meta_params = MetaParams()
meta_params.adjust(meta_param_dict)
# Get data chosen by user
if meta_params.data_source == "PUF":
data = retrieve_puf(
PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.PUFCSV_YEAR
# set name of cached baseline file in case use below
cached_pickle = "TxFuncEst_baseline_PUF.pkl"
else:
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "TMD":
data = retrieve_tmd(
TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.TMD_WEIGHTS_FILENAME
records_start_year = Records.TMDCSV_YEAR
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "CPS":
data = "cps"
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.CPSCSV_YEAR
# set name of cached baseline file in case use below
cached_pickle = "TxFuncEst_baseline_CPS.pkl"
else:
raise ValueError(
f"Data source '{meta_params.data_source}' is not supported."
)

# Get TC params adjustments
iit_mods = convert_policy_adjustment(
adjustment["Tax-Calculator Parameters"]
Expand All @@ -211,7 +246,7 @@ def run_model(meta_param_dict, adjustment):

# Dask parmeters
num_workers = 2
memory_limit = "10GiB"
memory_per_worker = "10GiB"
client = Client(
n_workers=num_workers,
threads_per_worker=1,
Expand All @@ -222,8 +257,7 @@ def run_model(meta_param_dict, adjustment):
# num_workers_txf = 5
# num_workers_mod = 6

# whether to estimate tax functions from microdata
run_micro = True
# Read in whether user chose to solve for transition path
time_path = meta_param_dict["time_path"][0]["value"]

# filter out OG-USA params that will not change between baseline and
Expand Down Expand Up @@ -363,6 +397,9 @@ def run_model(meta_param_dict, adjustment):
iit_reform=iit_mods,
estimate_tax_functions=True,
data=data,
gfactors=GrowFactors.FILE_NAME,
weights=weights,
records_start_year=records_start_year,
client=client,
)
# update tax function parameters in Specifications Object
Expand Down
40 changes: 40 additions & 0 deletions cs-config/cs_config/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
# S3 location of the TMD file; overridable via the TMD_S3_LOCATION env var.
# NOTE(review): the hard-coded fallback points at the PUF file
# (puf.20210720.csv.gz), apparently copy-pasted from PUF_S3_FILE_LOCATION
# above — confirm the intended TMD default object key.
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
TC_LAST_YEAR = Policy.LAST_BUDGET_YEAR

POLICY_SCHEMA = {
Expand Down Expand Up @@ -120,3 +123,40 @@ def retrieve_puf(
f"s3_reader_installed={s3_reader_installed})"
)
return None


def retrieve_tmd(
    tmd_s3_file_location=TMD_S3_FILE_LOCATION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
    """
    Retrieve the TMD microdata file.

    Tries, in order: the S3 bucket (when a location, credentials, and an
    S3 reader are all available), a local ``tmd.csv.gz``, then a local
    ``tmd.csv``. Issues a warning and returns None when no source is
    available.

    Args:
        tmd_s3_file_location (str): S3 URI of the TMD file
        aws_access_key_id (str): AWS access key ID
        aws_secret_access_key (str): AWS secret access key

    Returns:
        pandas.DataFrame or None: the TMD data, or None if unavailable
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (
        aws_access_key_id is not None and aws_secret_access_key is not None
    )
    if tmd_s3_file_location and has_credentials and s3_reader_installed:
        print("Reading tmd from S3 bucket.", tmd_s3_file_location)
        # Use the credentials passed to this function (not the module
        # globals) so callers can supply their own keys.
        fs = S3FileSystem(
            key=aws_access_key_id,
            secret=aws_secret_access_key,
        )
        with fs.open(tmd_s3_file_location) as f:
            tmd_df = pd.read_csv(f)
        return tmd_df
    elif Path("tmd.csv.gz").exists():
        print("Reading tmd from tmd.csv.gz.")
        return pd.read_csv("tmd.csv.gz", compression="gzip")
    elif Path("tmd.csv").exists():
        print("Reading tmd from tmd.csv.")
        return pd.read_csv("tmd.csv")
    else:
        warnings.warn(
            f"TMD file not available (tmd_location={tmd_s3_file_location}, "
            f"has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})"
        )
        return None
Binary file added data/PSID/psid1968to2015.csv.gz
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion data/PSID/psid_download.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,4 +176,5 @@ for (var in names(ind_var_names)){
print('Beginning to build panel')
# Build PSID panel
psid_df <- build.panel(datadir=mydir, fam.vars=famvars, ind.vars=indvars, sample="SRC", design='all')
save(psid_df,file=file.path(script.dir, 'psid1968to2017.RData'))

write.csv(psid_df, file=gzfile("psid1968to2017.csv.gz"))
Loading

0 comments on commit 579c526

Please sign in to comment.