diff --git a/spreg/diagnostics_sp.py b/spreg/diagnostics_sp.py index a3aaeaac..bf0dbfe7 100644 --- a/spreg/diagnostics_sp.py +++ b/spreg/diagnostics_sp.py @@ -18,7 +18,6 @@ class LMtests: - """ Lagrange Multiplier tests. Implemented as presented in :cite:`Anselin1996a` @@ -142,7 +141,6 @@ def __init__(self, ols, w, tests=["all"]): class MoranRes: - """ Moran's I for spatial autocorrelation in residuals from OLS regression @@ -396,7 +394,6 @@ def __init__(self, iv, w, case="nosp"): class spDcache: - """ Helper class to compute reusable pieces in the spatial diagnostics module ... @@ -669,7 +666,7 @@ def get_vI(ols, w, ei, spDcache): B = spDcache.AB[1] trB = np.sum(B.diagonal()) * 4.0 vi = (w.n ** 2 / (w.s0 ** 2 * (w.n - ols.k) * (w.n - ols.k + 2.0))) * ( - w.s1 + 2.0 * trA2 - trB - ((2.0 * (spDcache.trA ** 2)) / (w.n - ols.k)) + w.s1 + 2.0 * trA2 - trB - ((2.0 * (spDcache.trA ** 2)) / (w.n - ols.k)) ) return vi @@ -716,9 +713,6 @@ def akTest(iv, w, spDcache): p : float P-value of the test - ToDo: - * Code in as Nancy - * Compare both """ mi = get_mI(iv, w, spDcache) # Phi2 @@ -731,6 +725,42 @@ def akTest(iv, w, spDcache): return (mi, ak[0][0], pval[0][0]) +def comfac_test(lambd, beta, gamma, vm): + """ + Computes the Spatial Common Factor Hypothesis test as shown in Anselin (1988, p. 226-229) + + Parameters + ---------- + + lambd : float + Spatial autoregressive coefficient (as in lambd*Wy) + beta : array + Coefficients of the exogenous (not spatially lagged) variables, without the constant (as in X*beta) + gamma : array + coefficients of the spatially lagged exogenous variables (as in WX*gamma) + vm : array + Variance-covariance matrix of the coefficients + Obs. Needs to match the order of theta' = [beta', gamma', lambda] + + Returns + ------- + W : float + Wald statistic + pvalue : float + P value for Wald statistic calculated as a Chi sq. distribution + with k-1 degrees of freedom + + """ + g = lambd * beta + gamma + G = np.vstack((lambd * np.eye(beta.shape[0]), np.eye(beta.shape[0]), beta.T)) + + GVGi = la.inv(np.dot(G.T, np.dot(vm, G))) + W = np.dot(g.T, np.dot(GVGi, g))[0][0] + df = G.shape[1] + pvalue = chisqprob(W, df) + return W, pvalue + + def _test(): import doctest diff --git a/spreg/error_sp.py b/spreg/error_sp.py index 5fe47e00..765fddc8 100644 --- a/spreg/error_sp.py +++ b/spreg/error_sp.py @@ -9,23 +9,16 @@ import numpy as np from numpy import linalg as la from . import ols as OLS -from libpysal.weights.spatial_lag import lag_spatial -from .utils import power_expansion, set_endog, iter_msg, sp_att -from .utils import ( - get_A1_hom, - get_A2_hom, - get_A1_het, - optim_moments, - get_spFilter, - get_lags, - _moments2eqs, -) -from .utils import spdot, RegressionPropsY, set_warn +from .utils import set_endog, sp_att, optim_moments, get_spFilter, get_lags, spdot, RegressionPropsY, set_warn from . import twosls as TSLS from . import user_output as USER -from . import summary_output as SUMMARY +import pandas as pd +from .output import output, _spat_pseudo_r2 +from .error_sp_het import GM_Error_Het, GM_Endog_Error_Het, GM_Combo_Het +from .error_sp_hom import GM_Error_Hom, GM_Endog_Error_Hom, GM_Combo_Hom -__all__ = ["GM_Error", "GM_Endog_Error", "GM_Combo"] + +__all__ = ["GMM_Error", "GM_Error", "GM_Endog_Error", "GM_Combo"] class BaseGM_Error(RegressionPropsY): @@ -138,6 +131,9 @@ class GM_Error(BaseGM_Error): independent (exogenous) variable, excluding the constant w : pysal W object Spatial weights object (always needed) + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. vm : boolean If True, include variance-covariance matrix in summary results @@ -149,9 +145,13 @@ class GM_Error(BaseGM_Error): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -287,7 +287,7 @@ class GM_Error(BaseGM_Error): """ def __init__( - self, y, x, w, vm=False, name_y=None, name_x=None, name_w=None, name_ds=None + self, y, x, w, slx_lags=0, vm=False, name_y=None, name_x=None, name_w=None, name_ds=None, latex=False ): n = USER.check_arrays(y, x) @@ -295,14 +295,24 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + + self.title = "GM SPATIALLY WEIGHTED LEAST SQUARES" + if slx_lags >0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" + BaseGM_Error.__init__(self, y=y, x=x_constant, w=w.sparse) - self.title = "SPATIALLY WEIGHTED LEAST SQUARES" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) self.name_x.append("lambda") self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Error(reg=self, w=w, vm=vm) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x) - 1) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Endog_Error(RegressionPropsY): @@ -379,7 +389,7 @@ class BaseGM_Endog_Error(RegressionPropsY): >>> w.transform='r' >>> model = spreg.error_sp.BaseGM_Endog_Error(y, x, yend, q, w=w.sparse) >>> np.around(model.betas, decimals=4) - array([[82.573 ], + array([[82.5723], [ 0.581 ], [-1.4481], [ 0.3499]]) @@ -438,6 +448,9 @@ class GM_Endog_Error(BaseGM_Endog_Error): this should not contain any variables from x) w : pysal W object Spatial weights object (always needed) + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. vm : boolean If True, include variance-covariance matrix in summary results @@ -453,9 +466,13 @@ class GM_Endog_Error(BaseGM_Endog_Error): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -603,12 +620,12 @@ class GM_Endog_Error(BaseGM_Endog_Error): >>> print(model.name_z) ['CONSTANT', 'inc', 'hoval', 'lambda'] >>> np.around(model.betas, decimals=4) - array([[82.573 ], + array([[82.5723], [ 0.581 ], [-1.4481], [ 0.3499]]) >>> np.around(model.std_err, decimals=4) - array([16.1381, 1.3545, 0.7862]) + array([16.1382, 1.3545, 0.7862]) """ @@ -619,6 +636,7 @@ def __init__( yend, q, w, + slx_lags=0, vm=False, name_y=None, name_x=None, @@ -626,6 +644,7 @@ def __init__( name_q=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -633,8 +652,13 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + self.title = "GM SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES" + if slx_lags >0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" BaseGM_Endog_Error.__init__(self, y=y, x=x_constant, w=w.sparse, yend=yend, q=q) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) @@ -644,137 +668,14 @@ def __init__( self.name_q = USER.set_name_q(name_q, q) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Endog_Error(reg=self, w=w, vm=vm) - - -class BaseGM_Combo(BaseGM_Endog_Error): - - """ - GMM method for a spatial lag and error model, with endogenous variables - (note: no consistency checks, diagnostics or constant added); based on - Kelejian and Prucha (1998, 1999) :cite:`Kelejian1998` :cite:`Kelejian1999`. - - Parameters - ---------- - y : array - nx1 array for dependent variable - x : array - Two dimensional array with n rows and one column for each - independent (exogenous) variable, excluding the constant - yend : array - Two dimensional array with n rows and one column for each - endogenous variable - q : array - Two dimensional array with n rows and one column for each - external exogenous variable to use as instruments (note: - this should not contain any variables from x) - w : Sparse matrix - Spatial weights sparse matrix - w_lags : integer - Orders of W to include as instruments for the spatially - lagged dependent variable. For example, w_lags=1, then - instruments are WX; if w_lags=2, then WX, WWX; and so on. - lag_q : boolean - If True, then include spatial lags of the additional - instruments (q). - - Attributes - ---------- - betas : array - kx1 array of estimated coefficients - u : array - nx1 array of residuals - e_filtered : array - nx1 array of spatially filtered residuals - predy : array - nx1 array of predicted y values - n : integer - Number of observations - k : integer - Number of variables for which coefficients are estimated - (including the constant) - y : array - nx1 array for dependent variable - x : array - Two dimensional array with n rows and one column for each - independent (exogenous) variable, including the constant - yend : array - Two dimensional array with n rows and one column for each - endogenous variable - z : array - nxk array of variables (combination of x and yend) - mean_y : float - Mean of dependent variable - std_y : float - Standard deviation of dependent variable - vm : array - Variance covariance matrix (kxk) - sig2 : float - Sigma squared used in computations + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) - Examples - -------- - - >>> import numpy as np - >>> import libpysal - >>> import spreg - >>> db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') - >>> y = np.array(db.by_col("CRIME")) - >>> y = np.reshape(y, (49,1)) - >>> X = [] - >>> X.append(db.by_col("INC")) - >>> X = np.array(X).T - >>> w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) - >>> w.transform = 'r' - >>> w_lags = 1 - >>> yd2, q2 = spreg.set_endog(y, X, w, None, None, w_lags, True) - >>> X = np.hstack((np.ones(y.shape),X)) - - Example only with spatial lag - - >>> reg = spreg.error_sp.BaseGM_Combo(y, X, yend=yd2, q=q2, w=w.sparse) - - Print the betas - - >>> print(np.around(np.hstack((reg.betas[:-1],np.sqrt(reg.vm.diagonal()).reshape(3,1))),3)) - [[39.059 11.86 ] - [-1.404 0.391] - [ 0.467 0.2 ]] - - And lambda - - >>> print('Lamda: ', np.around(reg.betas[-1], 3)) - Lamda: [-0.048] - - Example with both spatial lag and other endogenous variables - - >>> X = [] - >>> X.append(db.by_col("INC")) - >>> X = np.array(X).T - >>> yd = [] - >>> yd.append(db.by_col("HOVAL")) - >>> yd = np.array(yd).T - >>> q = [] - >>> q.append(db.by_col("DISCBD")) - >>> q = np.array(q).T - >>> yd2, q2 = spreg.set_endog(y, X, w, yd, q, w_lags, True) - >>> X = np.hstack((np.ones(y.shape),X)) - >>> reg = spreg.error_sp.BaseGM_Combo(y, X, yd2, q2, w=w.sparse) - >>> betas = np.array([['CONSTANT'],['INC'],['HOVAL'],['W_CRIME']]) - >>> print(np.hstack((betas, np.around(np.hstack((reg.betas[:-1], np.sqrt(reg.vm.diagonal()).reshape(4,1))),4)))) - [['CONSTANT' '50.0944' '14.3593'] - ['INC' '-0.2552' '0.5667'] - ['HOVAL' '-0.6885' '0.3029'] - ['W_CRIME' '0.4375' '0.2314']] - - """ - - def __init__(self, y, x, yend=None, q=None, w=None, w_lags=1, lag_q=True): - - BaseGM_Endog_Error.__init__(self, y=y, x=x, w=w, yend=yend, q=q) - -class GM_Combo(BaseGM_Combo): +class GM_Combo(BaseGM_Endog_Error): """ GMM method for a spatial lag and error model with endogenous variables, @@ -801,9 +702,13 @@ class GM_Combo(BaseGM_Combo): Orders of W to include as instruments for the spatially lagged dependent variable. For example, w_lags=1, then instruments are WX; if w_lags=2, then WX, WWX; and so on. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the General Nesting + Spatial Model (GNSM) type. lag_q : boolean If True, then include spatial lags of the additional - instruments (q). + instruments (q) vm : boolean If True, include variance-covariance matrix in summary results @@ -819,9 +724,13 @@ class GM_Combo(BaseGM_Combo): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -1015,6 +924,7 @@ def __init__( q=None, w=None, w_lags=1, + slx_lags=0, lag_q=True, vm=False, name_y=None, @@ -1023,6 +933,7 @@ def __init__( name_q=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -1030,23 +941,23 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) - yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) - BaseGM_Combo.__init__( - self, - y=y, - x=x_constant, - w=w.sparse, - yend=yend2, - q=q2, - w_lags=w_lags, - lag_q=lag_q, - ) + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) + else: + yend2, q2, wx = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) + + BaseGM_Endog_Error.__init__(self, y=y, x=x_constant, w=w.sparse, yend=yend2, q=q2) + self.rho = self.betas[-2] self.predy_e, self.e_pred, warn = sp_att( w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) set_warn(self, warn) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES" + self.title = "SPATIALLY WEIGHTED 2SLS - GM-COMBO MODEL" + if slx_lags > 0: + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (GNSM)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) @@ -1058,7 +969,294 @@ def __init__( self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Combo(reg=self, w=w, vm=vm) + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend) - 1) + ['rho', 'lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _spat_pseudo_r2(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + +class GMM_Error(GM_Error, GM_Endog_Error, GM_Combo, GM_Error_Het, GM_Endog_Error_Het, + GM_Combo_Het, GM_Error_Hom, GM_Endog_Error_Hom, GM_Combo_Hom): + + """ + Wrapper function to call any of the GMM methods for a spatial error model available in spreg + + Parameters + ---------- + y : array + nx1 array for dependent variable + x : array + Two dimensional array with n rows and one column for each + independent (exogenous) variable, excluding the constant + w : pysal W object + Spatial weights object (always needed) + yend : array + Two dimensional array with n rows and one column for each + endogenous variable (if any) + q : array + Two dimensional array with n rows and one column for each + external exogenous variable to use as instruments (if any) + (note: this should not contain any variables from x) + estimator : string + Choice of estimator to be used. Options are: 'het', which + is robust to heteroskedasticity, 'hom', which assumes + homoskedasticity, and 'kp98', which does not provide + inference on the spatial parameter for the error term. + add_wy : boolean + If True, then a spatial lag of the dependent variable is included. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM or GNSM type. + vm : boolean + If True, include variance-covariance matrix in summary + results + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_yend : list of strings + Names of endogenous variables for use in output + name_q : list of strings + Names of instruments for use in output + name_ds : string + Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format + **kwargs : keywords + Additional arguments to pass on to the estimators. + See the specific functions for details on what can be used. + + Attributes + ---------- + output : dataframe + regression results pandas dataframe + summary : string + Summary of regression results and diagnostics (note: use in + conjunction with the print command) + betas : array + kx1 array of estimated coefficients + u : array + nx1 array of residuals + e_filtered : array + nx1 array of spatially filtered residuals + predy : array + nx1 array of predicted y values + n : integer + Number of observations + k : integer + Number of variables for which coefficients are estimated + (including the constant) + y : array + nx1 array for dependent variable + x : array + Two dimensional array with n rows and one column for each + independent (exogenous) variable, including the constant + mean_y : float + Mean of dependent variable + std_y : float + Standard deviation of dependent variable + pr2 : float + Pseudo R squared (squared correlation between y and ypred) + vm : array + Variance covariance matrix (kxk) + sig2 : float + Sigma squared used in computations + std_err : array + 1xk array of standard errors of the betas + z_stat : list of tuples + z statistic; each tuple contains the pair (statistic, + p-value), where each is a float + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_ds : string + Name of dataset for use in output + title : string + Name of the regression method used + name_yend : list of strings (optional) + Names of endogenous variables for use in output + name_z : list of strings (optional) + Names of exogenous and endogenous variables for use in + output + name_q : list of strings (optional) + Names of external instruments + name_h : list of strings (optional) + Names of all instruments used in ouput + + + Examples + -------- + + We first need to import the needed modules, namely numpy to convert the + data we read into arrays that ``spreg`` understands and ``libpysal`` to + handle the weights and file management. + + >>> import numpy as np + >>> import libpysal + >>> from libpysal.examples import load_example + + Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open(). + This is the DBF associated with the NAT shapefile. Note that + libpysal.io.open() also reads data in CSV format; since the actual class + requires data to be passed in as numpy arrays, the user can read their + data in using any method. + + >>> nat = load_example('Natregimes') + >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r') + + Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the + dependent variable for the regression. Note that PySAL requires this to be + an numpy array of shape (n, 1) as opposed to the also common shape of (n, ) + that other packages accept. + + >>> y_var = 'HR90' + >>> y = np.array([db.by_col(y_var)]).reshape(3085,1) + + Extract UE90 (unemployment rate) and PS90 (population structure) vectors from + the DBF to be used as independent variables in the regression. Other variables + can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2','...] + Note that PySAL requires this to be an nxj numpy array, where j is the + number of independent variables (not including a constant). By default + this model adds a vector of ones to the independent variables passed in. + + >>> x_var = ['PS90','UE90'] + >>> x = np.array([db.by_col(name) for name in x_var]).T + + Since we want to run a spatial error model, we need to specify + the spatial weights matrix that includes the spatial configuration of the + observations. To do that, we can open an already existing gal file or + create a new one. In this case, we will create one from ``NAT.shp``. + + >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp")) + + Unless there is a good reason not to do it, the weights have to be + row-standardized so every row of the matrix sums to one. Among other + things, this allows to interpret the spatial lag of a variable as the + average value of the neighboring observations. In PySAL, this can be + easily performed in the following way: + + >>> w.transform = 'r' + + The GMM_Error class can run error models and SARAR models, that is a spatial lag+error model. + In this example we will run a simple version of the latter, where we have the + spatial effects as well as exogenous variables. Since it is a spatial + model, we have to pass in the weights matrix. If we want to + have the names of the variables printed in the output summary, we will + have to pass them in as well, although this is optional. + + >>> from spreg import GMM_Error + >>> model = GMM_Error(y, x, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_ds='NAT') + + Once we have run the model, we can explore a little bit the output. The + regression object we have created has many attributes so take your time to + discover them. + + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 CONSTANT 2.176007 1.115807 1.950165 0.051156 + 1 PS90 1.108054 0.207964 5.328096 0.0 + 2 UE90 0.664362 0.061294 10.83893 0.0 + 3 W_HR90 -0.066539 0.154395 -0.430964 0.666494 + 4 lambda 0.765087 0.04268 17.926245 0.0 + + This class also allows the user to run a spatial lag+error model with the + extra feature of including non-spatial endogenous regressors. This means + that, in addition to the spatial lag and error, we consider some of the + variables on the right-hand side of the equation as endogenous and we + instrument for this. In this case we consider RD90 (resource deprivation) + as an endogenous regressor. We use FP89 (families below poverty) + for this and hence put it in the instruments parameter, 'q'. + + >>> yd_var = ['RD90'] + >>> yd = np.array([db.by_col(name) for name in yd_var]).T + >>> q_var = ['FP89'] + >>> q = np.array([db.by_col(name) for name in q_var]).T + + And then we can run and explore the model analogously to the previous combo: + + >>> model = GMM_Error(y, x, yend=yd, q=q, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_ds='NAT') + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 CONSTANT 5.44035 0.560476 9.706652 0.0 + 1 PS90 1.427042 0.1821 7.836572 0.0 + 2 UE90 -0.075224 0.050031 -1.503544 0.132699 + 3 RD90 3.316266 0.261269 12.692924 0.0 + 4 W_HR90 0.200314 0.057433 3.487777 0.000487 + 5 lambda 0.136933 0.070098 1.953457 0.050765 + + The class also allows for estimating a GNS model by adding spatial lags of the exogenous variables, using the argument slx_lags: + + >>> model = GMM_Error(y, x, w=w, add_wy=True, slx_lags=1, name_y=y_var, name_x=x_var, name_ds='NAT') + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 CONSTANT -0.554756 0.551765 -1.00542 0.314695 + 1 PS90 1.09369 0.225895 4.841583 0.000001 + 2 UE90 0.697393 0.082744 8.428291 0.0 + 3 W_PS90 -1.533378 0.396651 -3.865811 0.000111 + 4 W_UE90 -1.107944 0.33523 -3.305028 0.00095 + 5 W_HR90 1.529277 0.389354 3.927728 0.000086 + 6 lambda -0.917928 0.079569 -11.53625 0.0 + + + """ + + def __init__( + self, y, x, w, yend=None, q=None, estimator='het', add_wy=False, slx_lags=0, vm=False, name_y=None, name_x=None, name_w=None, name_yend=None, + name_q=None, name_ds=None, latex=False, **kwargs): + + if estimator == 'het': + if yend is None and not add_wy: + GM_Error_Het.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + elif yend is not None and not add_wy: + GM_Endog_Error_Het.__init__(self, y=y, x=x, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + elif add_wy: + GM_Combo_Het.__init__(self, y=y, x=x, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error not allowed. Using default arguments instead.') + GM_Error_Het.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex) + elif estimator == 'hom': + if yend is None and not add_wy: + GM_Error_Hom.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + elif yend is not None and not add_wy: + GM_Endog_Error_Hom.__init__(self, y=y, x=x, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + elif add_wy: + GM_Combo_Hom.__init__(self, y=y, x=x, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error not allowed. Using default arguments instead.') + GM_Error_Hom.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex) + elif estimator == 'kp98': + if yend is None and not add_wy: + GM_Error.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + elif yend is not None and not add_wy: + GM_Endog_Error.__init__(self, y=y, x=x, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + elif add_wy: + GM_Combo.__init__(self, y=y, x=x, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_ds=name_ds, latex=latex, **kwargs) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error not allowed. Using default arguments instead.') + GM_Error.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error not allowed. Using default arguments instead.') + GM_Error_Het.__init__(self, y=y, x=x, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + name_w=name_w, name_ds=name_ds, latex=latex) def _momentsGM_Error(w, u): @@ -1103,16 +1301,30 @@ def _test(): if __name__ == "__main__": _test() - import libpysal + import numpy as np + import libpysal - dbf = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r") - y = np.array([dbf.by_col("HOVAL")]).T - names_to_extract = ["INC", "CRIME"] - x = np.array([dbf.by_col(name) for name in names_to_extract]).T - w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), "r").read() - w.transform = "r" - model = GM_Error( - y, x, w, name_y="hoval", name_x=["income", "crime"], name_ds="columbus" - ) - print(model.summary) + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X = np.array(X).T + yd = [] + yd.append(db.by_col("CRIME")) + yd = np.array(yd).T + q = [] + q.append(db.by_col("DISCBD")) + q = np.array(q).T + + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = 'r' + #reg = GM_Error(y, X, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', vm=True) + #reg = GM_Endog_Error(y, X, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + # name_q=['discbd'], name_ds='columbus',vm=True) + reg = GM_Combo(y, X, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], + name_ds='columbus', vm=True) + + print(reg.output) + print(reg.summary) \ No newline at end of file diff --git a/spreg/error_sp_het.py b/spreg/error_sp_het.py index f40a7ae9..50b838fb 100755 --- a/spreg/error_sp_het.py +++ b/spreg/error_sp_het.py @@ -12,12 +12,13 @@ import numpy.linalg as la from . import ols as OLS from . import user_output as USER -from . import summary_output as SUMMARY from . import twosls as TSLS from . import utils as UTILS -from .utils import RegressionPropsY, spdot, set_endog, sphstack, set_warn +from .utils import RegressionPropsY, spdot, set_endog, sphstack, set_warn, get_lags from scipy import sparse as SP from libpysal.weights.spatial_lag import lag_spatial +import pandas as pd +from .output import output, _summary_iteration, _spat_pseudo_r2 __all__ = ["GM_Error_Het", "GM_Endog_Error_Het", "GM_Combo_Het"] @@ -170,6 +171,9 @@ class GM_Error_Het(BaseGM_Error_Het): independent (exogenous) variable, excluding the constant w : pysal W object Spatial weights object + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional @@ -191,9 +195,13 @@ class GM_Error_Het(BaseGM_Error_Het): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -336,6 +344,7 @@ def __init__( y, x, w, + slx_lags=0, max_iter=1, epsilon=0.00001, step1c=False, @@ -344,6 +353,7 @@ def __init__( name_x=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x) @@ -351,6 +361,12 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + self.title = "GM SPATIALLY WEIGHTED LEAST SQUARES (HET)" + if slx_lags >0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" BaseGM_Error_Het.__init__( self, y, @@ -360,13 +376,16 @@ def __init__( step1c=step1c, epsilon=epsilon, ) - self.title = "SPATIALLY WEIGHTED LEAST SQUARES (HET)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) self.name_x.append("lambda") self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Error_Het(reg=self, w=w, vm=vm) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x)-1) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Endog_Error_Het(RegressionPropsY): @@ -582,6 +601,9 @@ class GM_Endog_Error_Het(BaseGM_Endog_Error_Het): this should not contain any variables from x) w : pysal W object Spatial weights object + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional @@ -611,9 +633,13 @@ class GM_Endog_Error_Het(BaseGM_Endog_Error_Het): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -788,6 +814,7 @@ def __init__( yend, q, w, + slx_lags=0, max_iter=1, epsilon=0.00001, step1c=False, @@ -799,6 +826,7 @@ def __init__( name_q=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -806,6 +834,12 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + self.title = "GM SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET)" + if slx_lags > 0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" BaseGM_Endog_Error_Het.__init__( self, y=y, @@ -818,7 +852,6 @@ def __init__( epsilon=epsilon, inv_method=inv_method, ) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) @@ -828,7 +861,12 @@ def __init__( self.name_q = USER.set_name_q(name_q, q) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Endog_Error_Het(reg=self, w=w, vm=vm) + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Combo_Het(BaseGM_Endog_Error_Het): @@ -1026,6 +1064,10 @@ class GM_Combo_Het(BaseGM_Combo_Het): Orders of W to include as instruments for the spatially lagged dependent variable. For example, w_lags=1, then instruments are WX; if w_lags=2, then WX, WWX; and so on. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the General Nesting + Spatial Model (GNSM) type. lag_q : boolean If True, then include spatial lags of the additional instruments (q). @@ -1058,9 +1100,13 @@ class GM_Combo_Het(BaseGM_Combo_Het): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -1255,6 +1301,7 @@ def __init__( q=None, w=None, w_lags=1, + slx_lags=0, lag_q=True, max_iter=1, epsilon=0.00001, @@ -1267,6 +1314,7 @@ def __init__( name_q=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -1274,7 +1322,13 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) - yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) + + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) + else: + yend2, q2, wx = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) + BaseGM_Combo_Het.__init__( self, y=y, @@ -1294,7 +1348,10 @@ def __init__( w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) UTILS.set_warn(self, warn) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET)" + self.title = "SPATIALLY WEIGHTED 2SLS- GM-COMBO MODEL (HET)" + if slx_lags > 0: + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (GNSM)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) @@ -1306,7 +1363,13 @@ def __init__( self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Combo_Het(reg=self, w=w, vm=vm) + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend)-1) + ['rho', 'lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _spat_pseudo_r2(self) + self.other_top += _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) # Functions @@ -1551,3 +1614,28 @@ def _test(): if __name__ == "__main__": _test() + import numpy as np + import libpysal + + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X = np.array(X).T + yd = [] + yd.append(db.by_col("CRIME")) + yd = np.array(yd).T + q = [] + q.append(db.by_col("DISCBD")) + q = np.array(q).T + + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = 'r' + # reg = GM_Error_Het(y, X, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', vm=True) + # reg = GM_Endog_Error_Het(y, X, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + # name_q=['discbd'], name_ds='columbus',vm=True) + reg = GM_Combo_Het(y, X, yd, q, w=w, step1c=True, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], name_ds='columbus', + vm=True) + print(reg.output) + print(reg.summary) \ No newline at end of file diff --git a/spreg/error_sp_het_regimes.py b/spreg/error_sp_het_regimes.py index 9c5a4586..2f812d74 100644 --- a/spreg/error_sp_het_regimes.py +++ b/spreg/error_sp_het_regimes.py @@ -6,7 +6,6 @@ import numpy as np import multiprocessing as mp from . import user_output as USER -from . import summary_output as SUMMARY from . import utils as UTILS from . import regimes as REGI from .ols import BaseOLS @@ -22,10 +21,12 @@ get_vc_het_tsls, get_Omega_GS2SLS, ) -from .utils import RegressionPropsY, spdot, set_endog, sphstack, set_warn, sp_att +from .utils import RegressionPropsY, spdot, set_endog, sphstack, set_warn, sp_att, get_lags from scipy import sparse as SP from libpysal.weights.spatial_lag import lag_spatial from platform import system +import pandas as pd +from .output import output, _summary_iteration, _spat_pseudo_r2 class GM_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame): @@ -64,6 +65,9 @@ class GM_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame): If True, a separate regression is run for each regime. regime_lag_sep: boolean Always False, kept for consistency, ignored. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from Arraiz et al. Note: epsilon provides an additional stop condition. @@ -90,9 +94,13 @@ class GM_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -304,6 +312,7 @@ def __init__( cols2regi="all", regime_err_sep=False, regime_lag_sep=False, + slx_lags=0, cores=False, vm=False, name_y=None, @@ -311,6 +320,7 @@ def __init__( name_w=None, name_ds=None, name_regimes=None, + latex=False, ): n = USER.check_arrays(y, x) @@ -329,6 +339,12 @@ def __init__( x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + + if slx_lags >0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.name_x_r = USER.set_name_x(name_x, x_constant) cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant) @@ -339,11 +355,12 @@ def __init__( if regime_err_sep == True: if set(cols2regi) == set([True]): - self._error_regimes_multi( + self._error_het_regimes_multi( y, x_constant, regimes, w, + slx_lags, cores, max_iter, epsilon, @@ -351,21 +368,23 @@ def __init__( cols2regi, vm, name_x, + latex, ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) name_x = USER.set_name_x(name_x, x_constant) - self.x, self.name_x = REGI.Regimes_Frame.__init__( + self.x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True, ) ols = BaseOLS(y=y, x=self.x) self.k = ols.x.shape[1] @@ -412,16 +431,25 @@ def __init__( self.vm = get_vm_het(moments_i[0], lambda3, self, w.sparse, vc3) self.betas = np.vstack((ols_s.betas, lambda3)) self.e_filtered = self.u - lambda3 * lag_spatial(w, self.u) - self.title = "SPATIALLY WEIGHTED LEAST SQUARES (HET) - REGIMES" + if slx_lags == 0: + self.title = "GM SPATIALLY WEIGHTED MODEL (HET) - REGIMES" + else: + self.title = "GM SPATIALLY WEIGHTED MODEL + SLX (SDEM-HET) - REGIMES" + self.name_x.append("lambda") self.kf += 1 self.chow = REGI.Chow(self) self._cache = {} - - SUMMARY.GM_Error_Het(reg=self, w=w, vm=vm, regimes=True) - - def _error_regimes_multi( - self, y, x, regimes, w, cores, max_iter, epsilon, step1c, cols2regi, vm, name_x + self.output = pd.DataFrame(self.name_x, + columns=['var_names']) + self.output['var_type'] = ['x']*(len(self.name_x)-1)+['lambda'] + self.output['regime'] = x_rlist + ['_Global'] + self.output['equation'] = 0 + self.other_top = _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + def _error_het_regimes_multi( + self, y, x, regimes, w, slx_lags, cores, max_iter, epsilon, step1c, cols2regi, vm, name_x, latex ): regi_ids = dict( @@ -459,6 +487,7 @@ def _error_regimes_multi( name_x + ["lambda"], self.name_w, self.name_regimes, + slx_lags, ), ) else: @@ -477,6 +506,7 @@ def _error_regimes_multi( name_x + ["lambda"], self.name_w, self.name_regimes, + slx_lags, ) ) @@ -501,6 +531,7 @@ def _error_regimes_multi( results = {} self.name_y, self.name_x = [], [] counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -530,11 +561,17 @@ def _error_regimes_multi( regi_ids[r], ] = results[r].e_filtered self.name_y += results[r].name_y + results[r].name_x = [str(r) + '_lambda' if value == 'lambda' else value for value in results[r].name_x] self.name_x += results[r].name_x + results[r].other_top = _summary_iteration(results[r]) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x'] * (len(results[r].name_x)-1) + + ['lambda'], + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - SUMMARY.GM_Error_Het_multi(reg=self, multireg=self.multi, vm=vm, regimes=True) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class GM_Endog_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame): @@ -581,6 +618,9 @@ class GM_Endog_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame): If True, a separate regression is run for each regime. regime_lag_sep : boolean Always False, kept for consistency, ignored. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition. @@ -615,9 +655,13 @@ class GM_Endog_Error_Het_Regimes(RegressionPropsY, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -869,6 +913,7 @@ def __init__( cols2regi="all", regime_err_sep=False, regime_lag_sep=False, + slx_lags=0, inv_method="power_exp", cores=False, vm=False, @@ -881,6 +926,7 @@ def __init__( name_regimes=None, summ=True, add_lag=False, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -898,12 +944,16 @@ def __init__( set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + if slx_lags > 0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + if summ: name_yend = USER.set_name_yend(name_yend, yend) self.name_y = USER.set_name_y(name_y) name_q = USER.set_name_q(name_q, q) self.name_x_r = USER.set_name_x(name_x, x_constant) + name_yend - cols2regi = REGI.check_cols2regi( constant_regi, cols2regi, x_constant, yend=yend ) @@ -914,13 +964,14 @@ def __init__( if regime_err_sep == True: if set(cols2regi) == set([True]): - self._endog_error_regimes_multi( + self._endog_error_het_regimes_multi( y, x_constant, regimes, w, yend, q, + slx_lags, cores, max_iter, epsilon, @@ -932,10 +983,11 @@ def __init__( name_yend, name_q, add_lag, + latex, ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) @@ -943,15 +995,16 @@ def __init__( q, name_q = REGI.Regimes_Frame.__init__( self, q, regimes, constant_regi=None, cols2regi="all", names=name_q ) - x, name_x = REGI.Regimes_Frame.__init__( + x, name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True, ) - yend2, name_yend = REGI.Regimes_Frame.__init__( + yend2, name_yend, yend_rlist = REGI.Regimes_Frame.__init__( self, yend, regimes, @@ -959,6 +1012,7 @@ def __init__( cols2regi=cols2regi, yend=True, names=name_yend, + rlist=True, ) # 1a. S2SLS --> \tilde{\delta} @@ -1047,13 +1101,20 @@ def __init__( self.kf += 1 self.chow = REGI.Chow(self) self._cache = {} + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x']*len(self.name_x)+['yend']*len(self.name_yend)+['lambda'] + self.output['regime'] = x_rlist + yend_rlist + ['_Global'] + self.output['equation'] = 0 if summ: - self.title = ( - "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET) - REGIMES" - ) - SUMMARY.GM_Endog_Error_Het(reg=self, w=w, vm=vm, regimes=True) - - def _endog_error_regimes_multi( + self.other_top = _summary_iteration(self) + if slx_lags == 0: + self.title = ("GM SPATIALLY WEIGHTED 2SLS (HET) - REGIMES") + else: + self.title = ("GM SPATIALLY WEIGHTED 2SLS + SLX (SDEM-HET) - REGIMES") + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + def _endog_error_het_regimes_multi( self, y, x, @@ -1061,6 +1122,7 @@ def _endog_error_regimes_multi( w, yend, q, + slx_lags, cores, max_iter, epsilon, @@ -1072,6 +1134,7 @@ def _endog_error_regimes_multi( name_yend, name_q, add_lag, + latex, ): regi_ids = dict( @@ -1120,6 +1183,7 @@ def _endog_error_regimes_multi( self.name_w, self.name_regimes, add_lag, + slx_lags, ), ) else: @@ -1144,6 +1208,7 @@ def _endog_error_regimes_multi( self.name_w, self.name_regimes, add_lag, + slx_lags, ) ) @@ -1174,6 +1239,7 @@ def _endog_error_regimes_multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -1215,17 +1281,19 @@ def _endog_error_regimes_multi( self.e_pred[ regi_ids[r], ] = results[r].e_pred + results[r].other_top = _spat_pseudo_r2(results[r]) + v_type = ['x'] * len(results[r].name_x) + ['yend'] * (len(results[r].name_yend)-1) + ['rho','lambda'] + else: + results[r].other_top = "" + v_type = ['x'] * len(results[r].name_x) + ['yend'] * len(results[r].name_yend) + ['lambda'] + results[r].other_top += _summary_iteration(results[r]) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_z, + 'var_type': v_type, + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - if add_lag != False: - SUMMARY.GM_Combo_Het_multi( - reg=self, multireg=self.multi, vm=vm, regimes=True - ) - else: - SUMMARY.GM_Endog_Error_Het_multi( - reg=self, multireg=self.multi, vm=vm, regimes=True - ) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class GM_Combo_Het_Regimes(GM_Endog_Error_Het_Regimes): @@ -1274,6 +1342,9 @@ class GM_Combo_Het_Regimes(GM_Endog_Error_Het_Regimes): If True, the spatial parameter for spatial lag is also computed according to different regimes. If False (default), the spatial parameter is fixed across regimes. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the GNSM type. w_lags : integer Orders of W to include as instruments for the spatially lagged dependent variable. For example, w_lags=1, then @@ -1315,9 +1386,13 @@ class GM_Combo_Het_Regimes(GM_Endog_Error_Het_Regimes): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -1595,6 +1670,7 @@ def __init__( yend=None, q=None, w=None, + slx_lags=0, w_lags=1, lag_q=True, max_iter=1, @@ -1614,8 +1690,14 @@ def __init__( name_w=None, name_ds=None, name_regimes=None, + latex=False, ): - + if regime_lag_sep and not regime_err_sep: + set_warn(self, "regime_err_sep set to True when regime_lag_sep=True.") + regime_err_sep = True + if regime_err_sep and not regime_lag_sep: + set_warn(self, "regime_err_sep set to False when regime_lag_sep=False.") + regime_err_sep = False n = USER.check_arrays(y, x) self.step1c = step1c y = USER.check_y(y, n) @@ -1623,16 +1705,27 @@ def __init__( x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) - self.name_x_r = USER.set_name_x(name_x, x_constant) self.name_y = USER.set_name_y(name_y) name_yend = USER.set_name_yend(name_yend, yend) name_q = USER.set_name_q(name_q, q) - name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) - cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend, add_cons=False - ) + if regime_err_sep and any(col != True for col in cols2regi): + set_warn(self, "All coefficients must vary across regimes if regime_err_sep = True, so setting cols2regi = 'all'.") + cols2regi = "all" + + if slx_lags > 0: + yend2, q2, wx = set_endog(y, x_constant, w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) + name_slx = USER.set_name_spatial_lags(name_x, slx_lags) + name_q.extend(USER.set_name_q_sp(name_slx[-len(name_x):], w_lags, name_q, lag_q, force_all=True)) + name_x += name_slx + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant[:, :-1], yend=yend2, add_cons=False) + else: + name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) + yend2, q2 = yend, q + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False) + self.regimes_set = REGI._get_regimes_set(regimes) self.regimes = regimes USER.check_regimes(self.regimes_set, n, x_constant.shape[1]) @@ -1640,27 +1733,25 @@ def __init__( self.regime_lag_sep = regime_lag_sep if regime_lag_sep == True: - if regime_err_sep == False: - raise Exception( - "For spatial combo models, if spatial lag is set by regimes (regime_lag_sep=True), spatial error must also be set by regimes (regime_err_sep=True)." - ) - add_lag = [w_lags, lag_q] + if slx_lags == 0: + add_lag = [w_lags, lag_q] + else: + cols2regi += [True] + add_lag = False else: - cols2regi += [False] add_lag = False - if regime_err_sep == True: - raise Exception( - "For spatial combo models, if spatial error is set by regimes (regime_err_sep=True), all coefficients including lambda (regime_lag_sep=True) must be set by regimes." - ) - yend, q = set_endog(y, x_constant, w, yend, q, w_lags, lag_q) + cols2regi += [False] + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant, w, yend2, q2, w_lags, lag_q) + name_yend.append(USER.set_name_yend_sp(self.name_y)) GM_Endog_Error_Het_Regimes.__init__( self, y=y, x=x_constant, - yend=yend, - q=q, + yend=yend2, + q=q2, regimes=regimes, w=w, constant_regi=constant_regi, @@ -1681,17 +1772,24 @@ def __init__( name_regimes=name_regimes, summ=False, add_lag=add_lag, + latex=latex, ) if regime_err_sep != True: self.rho = self.betas[-2] self.predy_e, self.e_pred, warn = UTILS.sp_att( - w, self.y, self.predy, yend[:, -1].reshape(self.n, 1), self.rho + w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) UTILS.set_warn(self, warn) self.regime_lag_sep = regime_lag_sep - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET) - REGIMES" - SUMMARY.GM_Combo_Het(reg=self, w=w, vm=vm, regimes=True) + if slx_lags == 0: + self.title = "GM SPATIALLY WEIGHTED 2SLS-COMBO MODEL (HET) - REGIMES" + else: + self.title = "GM SPATIALLY WEIGHTED 2SLS-COMBO + SLX (GNSM-HET) - REGIMES" + self.output.iat[-2, self.output.columns.get_loc('var_type')] = 'rho' + self.other_top = _spat_pseudo_r2(self) + self.other_top += _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def _work_error( @@ -1708,6 +1806,7 @@ def _work_error( name_x, name_w, name_regimes, + slx_lags, ): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) y_r = y[regi_ids[r]] @@ -1717,7 +1816,10 @@ def _work_error( ) set_warn(model, warn) model.w = w_r - model.title = "SPATIALLY WEIGHTED LEAST SQUARES ESTIMATION (HET) - REGIME %s" % r + if slx_lags == 0: + model.title = "GM SPATIALLY WEIGHTED LS (HET) - REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED LS + SLX (SDEM-HET) - REGIME %s" % r model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] @@ -1746,6 +1848,7 @@ def _work_endog_error( name_w, name_regimes, add_lag, + slx_lags, ): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) y_r = y[regi_ids[r]] @@ -1777,12 +1880,22 @@ def _work_endog_error( w_r, model.y, model.predy, model.yend[:, -1].reshape(model.n, 1), model.rho ) set_warn(model, warn) - model.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HET) - REGIME %s" % r + + if slx_lags == 0: + if add_lag != False: + model.title = "GM SPATIALLY WEIGHTED 2SLS-COMBO MODEL (HET)- REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED 2SLS (HET) - REGIME %s" % r + else: + if add_lag != False: + model.title = "GM SPATIAL COMBO MODEL + SLX (GNSM-HET) - REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED 2SLS + SLX (SDEM-HET) - REGIME %s" % r model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] model.name_yend = ["%s_%s" % (str(r), i) for i in name_yend] - model.name_z = model.name_x + model.name_yend + ["lambda"] + model.name_z = model.name_x + model.name_yend + [str(r)+"lambda"] model.name_q = ["%s_%s" % (str(r), i) for i in name_q] model.name_h = model.name_x + model.name_q model.name_w = name_w @@ -1801,3 +1914,33 @@ def _test(): if __name__ == "__main__": _test() + + import numpy as np + import libpysal + + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X = np.array(X).T + yd = [] + yd.append(db.by_col("CRIME")) + yd = np.array(yd).T + q = [] + q.append(db.by_col("DISCBD")) + q = np.array(q).T + + r_var = 'NSA' + regimes = db.by_col(r_var) + + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = 'r' + #reg = GM_Error_Het_Regimes(y, X, regimes, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', vm=True, + # regime_err_sep=True) + #reg = GM_Endog_Error_Het_Regimes(y, X, yd, q, regimes, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + # name_q=['discbd'], name_ds='columbus',vm=True, regime_err_sep=True) + reg = GM_Combo_Het_Regimes(y, X, regimes, yd, q, w=w, step1c=True, name_x=['inc'], name_y='hoval', name_yend=['crime'], + name_q=['discbd'], name_ds='columbus', vm=True, regime_err_sep=False, regime_lag_sep=False) + print(reg.output) + print(reg.summary) \ No newline at end of file diff --git a/spreg/error_sp_hom.py b/spreg/error_sp_hom.py index 5a4d8042..6208b8af 100644 --- a/spreg/error_sp_hom.py +++ b/spreg/error_sp_hom.py @@ -10,14 +10,14 @@ import numpy as np from numpy import linalg as la from . import ols as OLS -from libpysal.weights.spatial_lag import lag_spatial -from .utils import power_expansion, set_endog, iter_msg, sp_att +from .utils import set_endog, iter_msg, sp_att from .utils import get_A1_hom, get_A2_hom, get_A1_het, optim_moments -from .utils import get_spFilter, get_lags, _moments2eqs +from .utils import get_spFilter, get_lags from .utils import spdot, RegressionPropsY, set_warn from . import twosls as TSLS from . import user_output as USER -from . import summary_output as SUMMARY +import pandas as pd +from .output import output, _spat_pseudo_r2, _summary_iteration __all__ = ["GM_Error_Hom", "GM_Endog_Error_Hom", "GM_Combo_Hom"] @@ -178,6 +178,9 @@ class GM_Error_Hom(BaseGM_Error_Hom): independent (exogenous) variable, excluding the constant w : pysal W object Spatial weights object + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition. @@ -201,10 +204,13 @@ class GM_Error_Hom(BaseGM_Error_Hom): Name of weights matrix for use in output name_ds : string Name of dataset for use in output - + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -341,6 +347,7 @@ def __init__( y, x, w, + slx_lags=0, max_iter=1, epsilon=0.00001, A1="hom_sc", @@ -349,6 +356,7 @@ def __init__( name_x=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x) @@ -356,6 +364,12 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + self.title = "GM SPATIALLY WEIGHTED LEAST SQUARES (HOM)" + if slx_lags >0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" BaseGM_Error_Hom.__init__( self, y=y, @@ -365,13 +379,17 @@ def __init__( max_iter=max_iter, epsilon=epsilon, ) - self.title = "SPATIALLY WEIGHTED LEAST SQUARES (HOM)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) self.name_x.append("lambda") self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Error_Hom(reg=self, w=w, vm=vm) + self.A1 = A1 + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x) - 1) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Endog_Error_Hom(RegressionPropsY): @@ -564,6 +582,9 @@ class GM_Endog_Error_Hom(BaseGM_Endog_Error_Hom): this should not contain any variables from x) w : pysal W object Spatial weights object + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition. @@ -591,9 +612,13 @@ class GM_Endog_Error_Hom(BaseGM_Endog_Error_Hom): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -769,6 +794,7 @@ def __init__( yend, q, w, + slx_lags=0, max_iter=1, epsilon=0.00001, A1="hom_sc", @@ -779,6 +805,7 @@ def __init__( name_q=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -786,6 +813,12 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + self.title = "GM SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HOM)" + if slx_lags > 0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" BaseGM_Endog_Error_Hom.__init__( self, y=y, @@ -797,7 +830,6 @@ def __init__( max_iter=max_iter, epsilon=epsilon, ) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HOM)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) @@ -807,7 +839,13 @@ def __init__( self.name_q = USER.set_name_q(name_q, q) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Endog_Error_Hom(reg=self, w=w, vm=vm) + self.A1 = A1 + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class BaseGM_Combo_Hom(BaseGM_Endog_Error_Hom): @@ -1007,6 +1045,10 @@ class GM_Combo_Hom(BaseGM_Combo_Hom): Orders of W to include as instruments for the spatially lagged dependent variable. For example, w_lags=1, then instruments are WX; if w_lags=2, then WX, WWX; and so on. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the General Nesting + Spatial Model (GNSM) type. lag_q : boolean If True, then include spatial lags of the additional instruments (q). @@ -1038,9 +1080,13 @@ class GM_Combo_Hom(BaseGM_Combo_Hom): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -1232,6 +1278,7 @@ def __init__( q=None, w=None, w_lags=1, + slx_lags=0, lag_q=True, max_iter=1, epsilon=0.00001, @@ -1243,6 +1290,7 @@ def __init__( name_q=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -1250,7 +1298,11 @@ def __init__( USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) - yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q) + else: + yend2, q2, wx = set_endog(y, x_constant[:, 1:], w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) BaseGM_Combo_Hom.__init__( self, y=y, @@ -1269,7 +1321,10 @@ def __init__( w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) set_warn(self, warn) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HOM)" + self.title = "SPATIALLY WEIGHTED 2SLS- GM-COMBO MODEL (HOM)" + if slx_lags > 0: + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (GNSM)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x_constant) @@ -1281,8 +1336,14 @@ def __init__( self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.name_w = USER.set_name_w(name_w, w) - SUMMARY.GM_Combo_Hom(reg=self, w=w, vm=vm) - + self.A1 = A1 + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend) - 1) + ['rho', 'lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _spat_pseudo_r2(self) + self.other_top += _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) # Functions @@ -1514,5 +1575,31 @@ def _test(): if __name__ == "__main__": - _test() + + import numpy as np + import libpysal + + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X = np.array(X).T + yd = [] + yd.append(db.by_col("CRIME")) + yd = np.array(yd).T + q = [] + q.append(db.by_col("DISCBD")) + q = np.array(q).T + + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = 'r' + #reg = GM_Error_Hom(y, X, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', vm=True) + #reg = GM_Endog_Error_Hom(y, X, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + # name_q=['discbd'], name_ds='columbus',vm=True) + reg = GM_Combo_Hom(y, X, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], name_q=['discbd'], + name_ds='columbus', vm=True) + + print(reg.output) + print(reg.summary) \ No newline at end of file diff --git a/spreg/error_sp_hom_regimes.py b/spreg/error_sp_hom_regimes.py index 9812584e..f8d7dfae 100644 --- a/spreg/error_sp_hom_regimes.py +++ b/spreg/error_sp_hom_regimes.py @@ -26,9 +26,9 @@ ) from . import regimes as REGI from . import user_output as USER -from . import summary_output as SUMMARY from platform import system - +import pandas as pd +from .output import output, _summary_iteration, _spat_pseudo_r2 class GM_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame): @@ -67,6 +67,9 @@ class GM_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame): If True, a separate regression is run for each regime. regime_lag_sep: boolean Always False, kept for consistency, ignored. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional @@ -97,9 +100,13 @@ class GM_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -317,12 +324,14 @@ def __init__( cols2regi="all", regime_err_sep=False, regime_lag_sep=False, + slx_lags=0, vm=False, name_y=None, name_x=None, name_w=None, name_ds=None, name_regimes=None, + latex=False, ): n = USER.check_arrays(y, x) @@ -341,6 +350,12 @@ def __init__( x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + + if slx_lags >0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.name_x_r = USER.set_name_x(name_x, x_constant) cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant) @@ -351,11 +366,12 @@ def __init__( if regime_err_sep == True: if set(cols2regi) == set([True]): - self._error_regimes_multi( + self._error_hom_regimes_multi( y, x_constant, regimes, w, + slx_lags, cores, max_iter, epsilon, @@ -363,10 +379,11 @@ def __init__( cols2regi, vm, name_x, + latex, ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) @@ -381,13 +398,14 @@ def __init__( wA2 = get_A2_hom(w.sparse) # 1a. OLS --> \tilde{\delta} - self.x, self.name_x = REGI.Regimes_Frame.__init__( + self.x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True, ) ols = BaseOLS(y=y, x=self.x) self.k = ols.x.shape[1] @@ -425,16 +443,25 @@ def __init__( w.sparse, wA1, wA2, self, lambda2, moments[0] ) self.e_filtered = self.u - lambda2 * lag_spatial(w, self.u) - self.title = "SPATIALLY WEIGHTED LEAST SQUARES (HOM) - REGIMES" + if slx_lags == 0: + self.title = "GM SPATIALLY WEIGHTED MODEL (HOM) - REGIMES" + else: + self.title = "GM SPATIALLY WEIGHTED MODEL + SLX (SDEM-HOM) - REGIMES" self.name_x.append("lambda") self.kf += 1 self.chow = REGI.Chow(self) self._cache = {} - SUMMARY.GM_Error_Hom(reg=self, w=w, vm=vm, regimes=True) - - def _error_regimes_multi( - self, y, x, regimes, w, cores, max_iter, epsilon, A1, cols2regi, vm, name_x - ): + self.A1 = A1 + self.output = pd.DataFrame(self.name_x, + columns=['var_names']) + self.output['var_type'] = ['x']*(len(self.name_x)-1)+['lambda'] + self.output['regime'] = x_rlist + ['_Global'] + self.output['equation'] = 0 + self.other_top = _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + def _error_hom_regimes_multi( + self, y, x, regimes, w, slx_lags, cores, max_iter, epsilon, A1, cols2regi, vm, name_x, latex): regi_ids = dict( (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set @@ -471,6 +498,7 @@ def _error_regimes_multi( name_x + ["lambda"], self.name_w, self.name_regimes, + slx_lags, ), ) else: @@ -489,6 +517,7 @@ def _error_regimes_multi( name_x + ["lambda"], self.name_w, self.name_regimes, + slx_lags, ) ) @@ -513,6 +542,7 @@ def _error_regimes_multi( results = {} counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -543,10 +573,16 @@ def _error_regimes_multi( ] = results[r].e_filtered self.name_y += results[r].name_y self.name_x += results[r].name_x + results[r].A1 = A1 + results[r].other_top = _summary_iteration(results[r]) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x'] * (len(results[r].name_x) - 1) + + ['lambda'], + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - SUMMARY.GM_Error_Hom_multi(reg=self, multireg=self.multi, vm=vm, regimes=True) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class GM_Endog_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame): @@ -594,6 +630,9 @@ class GM_Endog_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame): If True, a separate regression is run for each regime. regime_lag_sep : boolean Always False, kept for consistency, ignored. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. max_iter : int Maximum number of iterations of steps 2a and 2b from :cite:`Arraiz2010`. Note: epsilon provides an additional stop condition. @@ -624,9 +663,13 @@ class GM_Endog_Error_Hom_Regimes(RegressionPropsY, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -884,6 +927,7 @@ def __init__( cols2regi="all", regime_err_sep=False, regime_lag_sep=False, + slx_lags=0, max_iter=1, epsilon=0.00001, A1="het", @@ -898,6 +942,7 @@ def __init__( name_regimes=None, summ=True, add_lag=False, + latex=False, ): n = USER.check_arrays(y, x, yend, q) @@ -914,6 +959,10 @@ def __init__( x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + if slx_lags > 0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) if summ: name_yend = USER.set_name_yend(name_yend, yend) @@ -931,13 +980,14 @@ def __init__( if regime_err_sep == True: if set(cols2regi) == set([True]): - self._endog_error_regimes_multi( + self._endog_error_hom_regimes_multi( y, x_constant, regimes, w, yend, q, + slx_lags, cores, max_iter, epsilon, @@ -948,10 +998,11 @@ def __init__( name_yend, name_q, add_lag, + latex, ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) @@ -959,15 +1010,16 @@ def __init__( q, name_q = REGI.Regimes_Frame.__init__( self, q, regimes, constant_regi=None, cols2regi="all", names=name_q ) - x, name_x = REGI.Regimes_Frame.__init__( + x, name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True, ) - yend2, name_yend = REGI.Regimes_Frame.__init__( + yend2, name_yend, yend_rlist = REGI.Regimes_Frame.__init__( self, yend, regimes, @@ -975,6 +1027,7 @@ def __init__( cols2regi=cols2regi, yend=True, names=name_yend, + rlist=True, ) if A1 == "hom": wA1 = get_A1_hom(w.sparse) @@ -1042,13 +1095,21 @@ def __init__( self.kf += 1 self.chow = REGI.Chow(self) self._cache = {} + self.A1 = A1 + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + ['lambda'] + self.output['regime'] = x_rlist + yend_rlist + ['_Global'] + self.output['equation'] = 0 if summ: - self.title = ( - "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HOM) - REGIMES" - ) - SUMMARY.GM_Endog_Error_Hom(reg=self, w=w, vm=vm, regimes=True) - - def _endog_error_regimes_multi( + self.other_top = _summary_iteration(self) + if slx_lags == 0: + self.title = ("GM SPATIALLY WEIGHTED 2SLS (HOM) - REGIMES") + else: + self.title = ("GM SPATIALLY WEIGHTED 2SLS WITH SLX (SDEM-HOM) - REGIMES") + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + def _endog_error_hom_regimes_multi( self, y, x, @@ -1056,6 +1117,7 @@ def _endog_error_regimes_multi( w, yend, q, + slx_lags, cores, max_iter, epsilon, @@ -1066,6 +1128,7 @@ def _endog_error_regimes_multi( name_yend, name_q, add_lag, + latex, ): regi_ids = dict( @@ -1113,6 +1176,7 @@ def _endog_error_regimes_multi( self.name_w, self.name_regimes, add_lag, + slx_lags, ), ) else: @@ -1136,6 +1200,7 @@ def _endog_error_regimes_multi( self.name_w, self.name_regimes, add_lag, + slx_lags, ) ) @@ -1166,6 +1231,7 @@ def _endog_error_regimes_multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -1207,17 +1273,20 @@ def _endog_error_regimes_multi( self.e_pred[ regi_ids[r], ] = results[r].e_pred + results[r].other_top = _spat_pseudo_r2(results[r]) + v_type = ['x'] * len(results[r].name_x) + ['yend'] * (len(results[r].name_yend)-1) + ['rho','lambda'] + else: + results[r].other_top = "" + v_type = ['x'] * len(results[r].name_x) + ['yend'] * len(results[r].name_yend) + ['lambda'] + results[r].A1 = A1 + results[r].other_top += _summary_iteration(results[r]) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_z, + 'var_type': v_type, + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - if add_lag != False: - SUMMARY.GM_Combo_Hom_multi( - reg=self, multireg=self.multi, vm=vm, regimes=True - ) - else: - SUMMARY.GM_Endog_Error_Hom_multi( - reg=self, multireg=self.multi, vm=vm, regimes=True - ) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class GM_Combo_Hom_Regimes(GM_Endog_Error_Hom_Regimes): @@ -1267,6 +1336,9 @@ class GM_Combo_Hom_Regimes(GM_Endog_Error_Hom_Regimes): If True, the spatial parameter for spatial lag is also computed according to different regimes. If False (default), the spatial parameter is fixed across regimes. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the GNSM type. w_lags : integer Orders of W to include as instruments for the spatially lagged dependent variable. For example, w_lags=1, then @@ -1307,9 +1379,13 @@ class GM_Combo_Hom_Regimes(GM_Endog_Error_Hom_Regimes): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -1593,6 +1669,7 @@ def __init__( yend=None, q=None, w=None, + slx_lags=0, w_lags=1, lag_q=True, cores=False, @@ -1611,22 +1688,42 @@ def __init__( name_w=None, name_ds=None, name_regimes=None, + latex=False, ): + if regime_lag_sep and not regime_err_sep: + set_warn(self, "regime_err_sep set to True when regime_lag_sep=True.") + regime_err_sep = True + if regime_err_sep and not regime_lag_sep: + set_warn(self, "regime_err_sep set to False when regime_lag_sep=False.") + regime_err_sep = False n = USER.check_arrays(y, x) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + self.name_y = USER.set_name_y(name_y) name_yend = USER.set_name_yend(name_yend, yend) name_q = USER.set_name_q(name_q, q) - name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) - cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend, add_cons=False - ) + if regime_err_sep and any(col != True for col in cols2regi): + set_warn(self, "All coefficients must vary across regimes if regime_err_sep = True, so setting cols2regi = 'all'.") + cols2regi = "all" + + if slx_lags > 0: + yend2, q2, wx = set_endog(y, x_constant, w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) + name_slx = USER.set_name_spatial_lags(name_x, slx_lags) + name_q.extend(USER.set_name_q_sp(name_slx[-len(name_x):], w_lags, name_q, lag_q, force_all=True)) + name_x += name_slx + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant[:, :-1], yend=yend2, add_cons=False) + else: + name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) + yend2, q2 = yend, q + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False) + self.regimes_set = REGI._get_regimes_set(regimes) self.regimes = regimes USER.check_regimes(self.regimes_set, n, x_constant.shape[1]) @@ -1634,27 +1731,26 @@ def __init__( self.regime_lag_sep = regime_lag_sep if regime_lag_sep == True: - if regime_err_sep == False: - raise Exception( - "For spatial combo models, if spatial lag is set by regimes (regime_lag_sep=True), spatial error must also be set by regimes (regime_err_sep=True)." - ) - add_lag = [w_lags, lag_q] + if slx_lags == 0: + add_lag = [w_lags, lag_q] + else: + add_lag = False + cols2regi += [True] else: - cols2regi += [False] add_lag = False - if regime_err_sep == True: - raise Exception( - "For spatial combo models, if spatial error is set by regimes (regime_err_sep=True), all coefficients including lambda (regime_lag_sep=True) must be set by regimes." - ) - yend, q = set_endog(y, x_constant, w, yend, q, w_lags, lag_q) + cols2regi += [False] + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant, w, yend2, q2, w_lags, lag_q) + + name_yend.append(USER.set_name_yend_sp(self.name_y)) GM_Endog_Error_Hom_Regimes.__init__( self, y=y, x=x_constant, - yend=yend, - q=q, + yend=yend2, + q=q2, regimes=regimes, w=w, vm=vm, @@ -1674,17 +1770,24 @@ def __init__( name_regimes=name_regimes, summ=False, add_lag=add_lag, + latex=latex, ) if regime_err_sep != True: self.rho = self.betas[-2] self.predy_e, self.e_pred, warn = sp_att( - w, self.y, self.predy, yend[:, -1].reshape(self.n, 1), self.rho + w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) set_warn(self, warn) self.regime_lag_sep = regime_lag_sep - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HOM) - REGIMES" - SUMMARY.GM_Combo_Hom(reg=self, w=w, vm=vm, regimes=True) + if slx_lags == 0: + self.title = "GM SPATIALLY WEIGHTED 2SLS-COMBO MODEL (HOM) - REGIMES" + else: + self.title = "GM SPATIALLY WEIGHTED 2SLS-COMBO WITH SLX (GNSM-HOM) - REGIMES" + self.output.iat[-2, self.output.columns.get_loc('var_type')] = 'rho' + self.other_top = _spat_pseudo_r2(self) + self.other_top += _summary_iteration(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def _work_error( @@ -1701,6 +1804,7 @@ def _work_error( name_x, name_w, name_regimes, + slx_lags, ): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) y_r = y[regi_ids[r]] @@ -1710,7 +1814,10 @@ def _work_error( ) set_warn(model, warn) model.w = w_r - model.title = "SPATIALLY WEIGHTED LEAST SQUARES ESTIMATION (HOM) - REGIME %s" % r + if slx_lags == 0: + model.title = "GM SPATIALLY WEIGHTED 2SLS (HOM) - REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED 2SLS + SLX (SDEM-HOM) - REGIME %s" % r model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] @@ -1738,6 +1845,7 @@ def _work_endog_error( name_w, name_regimes, add_lag, + slx_lags, ): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) y_r = y[regi_ids[r]] @@ -1762,12 +1870,21 @@ def _work_endog_error( w_r, model.y, model.predy, model.yend[:, -1].reshape(model.n, 1), model.rho ) set_warn(model, warn) - model.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES (HOM) - REGIME %s" % r + if slx_lags == 0: + if add_lag != False: + model.title = "GM SPATIALLY WEIGHTED 2SLS-COMBO MODEL (HOM)- REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED 2SLS (HOM) - REGIME %s" % r + else: + if add_lag != False: + model.title = "GM SPATIAL COMBO MODEL + SLX (GNSM-HOM) - REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED 2SLS + SLX (SDEM-HOM) - REGIME %s" % r model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] model.name_yend = ["%s_%s" % (str(r), i) for i in name_yend] - model.name_z = model.name_x + model.name_yend + ["lambda"] + model.name_z = model.name_x + model.name_yend + [str(r)+"_lambda"] model.name_q = ["%s_%s" % (str(r), i) for i in name_q] model.name_h = model.name_x + model.name_q model.name_w = name_w @@ -1786,3 +1903,33 @@ def _test(): if __name__ == "__main__": _test() + + import numpy as np + import libpysal + + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49,1)) + X = [] + X.append(db.by_col("INC")) + X = np.array(X).T + yd = [] + yd.append(db.by_col("CRIME")) + yd = np.array(yd).T + q = [] + q.append(db.by_col("DISCBD")) + q = np.array(q).T + + r_var = 'NSA' + regimes = db.by_col(r_var) + + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = 'r' + #reg = GM_Error_Hom_Regimes(y, X, regimes, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', vm=True, + # regime_err_sep=True) + #reg = GM_Endog_Error_Hom_Regimes(y, X, yd, q, regimes, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + # name_q=['discbd'], name_ds='columbus',vm=True, regime_err_sep=True) + reg = GM_Combo_Hom_Regimes(y, X, regimes, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + name_q=['discbd'], name_ds='columbus', vm=True, regime_err_sep=False, regime_lag_sep=False) + print(reg.output) + print(reg.summary) \ No newline at end of file diff --git a/spreg/error_sp_regimes.py b/spreg/error_sp_regimes.py index b1c1feea..8c62c7d8 100644 --- a/spreg/error_sp_regimes.py +++ b/spreg/error_sp_regimes.py @@ -8,7 +8,6 @@ import multiprocessing as mp from . import regimes as REGI from . import user_output as USER -from . import summary_output as SUMMARY from libpysal.weights.spatial_lag import lag_spatial from .ols import BaseOLS from .twosls import BaseTSLS @@ -17,6 +16,10 @@ from .utils import optim_moments, get_spFilter, get_lags from .utils import spdot, RegressionPropsY from .sputils import sphstack +import pandas as pd +from .output import output, _spat_pseudo_r2 +from .error_sp_het_regimes import GM_Error_Het_Regimes, GM_Endog_Error_Het_Regimes, GM_Combo_Het_Regimes +from .error_sp_hom_regimes import GM_Error_Hom_Regimes, GM_Endog_Error_Hom_Regimes, GM_Combo_Hom_Regimes class GM_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): @@ -55,6 +58,9 @@ class GM_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): If True, a separate regression is run for each regime. regime_lag_sep: boolean Always False, kept for consistency, ignored. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. vm : boolean If True, include variance-covariance matrix in summary results @@ -72,10 +78,13 @@ class GM_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output - + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -251,28 +260,15 @@ class GM_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): values in model.se_betas). Alternatively, we can have a summary of the output by typing: model.summary - >>> print(model.name_x) - ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', 'lambda'] - >>> np.around(model.betas, decimals=6) - array([[0.074807], - [0.786107], - [0.538849], - [5.103756], - [1.196009], - [0.600533], - [0.364103]]) - >>> np.around(model.std_err, decimals=6) - array([0.379864, 0.152316, 0.051942, 0.471285, 0.19867 , 0.057252]) - >>> np.around(model.z_stat, decimals=6) - array([[ 0.196932, 0.843881], - [ 5.161042, 0. ], - [10.37397 , 0. ], - [10.829455, 0. ], - [ 6.02007 , 0. ], - [10.489215, 0. ]]) - >>> np.around(model.sig2, decimals=6) - 28.172732 - + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 0.074811 0.379864 0.196942 0.843873 + 1 0_PS90 0.786105 0.152315 5.161043 0.0 + 2 0_UE90 0.538848 0.051942 10.373969 0.0 + 3 1_CONSTANT 5.103761 0.471284 10.82949 0.0 + 4 1_PS90 1.196009 0.19867 6.020074 0.0 + 5 1_UE90 0.600532 0.057252 10.489217 0.0 + 6 lambda 0.3641 None None None """ def __init__( @@ -289,17 +285,26 @@ def __init__( cols2regi="all", regime_err_sep=False, regime_lag_sep=False, + slx_lags=0, cores=False, name_ds=None, name_regimes=None, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) - x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) + x_constant, name_x, warn = USER.check_constant( + x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + + if slx_lags >0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.name_x_r = USER.set_name_x(name_x, x_constant) self.constant_regi = constant_regi @@ -319,22 +324,24 @@ def __init__( if regime_err_sep == True: if set(cols2regi) == set([True]): self._error_regimes_multi( - y, x_constant, regimes, w, cores, cols2regi, vm, name_x + y, x_constant, regimes, w, slx_lags, cores, cols2regi, vm, name_x, latex ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: - x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) + x_constant = sphstack( + np.ones((x_constant.shape[0], 1)), x_constant) name_x = USER.set_name_x(name_x, x_constant) - self.x, self.name_x = REGI.Regimes_Frame.__init__( + self.x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True, ) ols = BaseOLS(y=y, x=self.x) self.k = ols.x.shape[1] @@ -354,14 +361,22 @@ def __init__( self.sig2 = ols2.sig2n self.e_filtered = self.u - lambda1 * lag_spatial(w, self.u) self.vm = self.sig2 * ols2.xtxi - self.title = "SPATIALLY WEIGHTED LEAST SQUARES - REGIMES" + if slx_lags == 0: + self.title = "GM SPATIALLY WEIGHTED MODEL - REGIMES" + else: + self.title = "GM SPATIALLY WEIGHTED MODEL + SLX (SDEM) - REGIMES" self.name_x.append("lambda") self.kf += 1 self.chow = REGI.Chow(self) self._cache = {} - SUMMARY.GM_Error(reg=self, w=w, vm=vm, regimes=True) - - def _error_regimes_multi(self, y, x, regimes, w, cores, cols2regi, vm, name_x): + self.output = pd.DataFrame(self.name_x, + columns=['var_names']) + self.output['var_type'] = ['x']*(len(self.name_x)-1)+['lambda'] + self.output['regime'] = x_rlist + ['_Global'] + self.output['equation'] = 0 + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + def _error_regimes_multi(self, y, x, regimes, w, slx_lags, cores, cols2regi, vm, name_x, latex): regi_ids = dict( (r, list(np.where(np.array(regimes) == r)[0])) for r in self.regimes_set ) @@ -394,6 +409,7 @@ def _error_regimes_multi(self, y, x, regimes, w, cores, cols2regi, vm, name_x): name_x + ["lambda"], self.name_w, self.name_regimes, + slx_lags, ), ) else: @@ -409,6 +425,7 @@ def _error_regimes_multi(self, y, x, regimes, w, cores, cols2regi, vm, name_x): name_x + ["lambda"], self.name_w, self.name_regimes, + slx_lags, ) ) @@ -432,6 +449,8 @@ def _error_regimes_multi(self, y, x, regimes, w, cores, cols2regi, vm, name_x): results = {} self.name_y, self.name_x = [], [] + self.output = pd.DataFrame( + columns=['var_names', 'var_type', 'regime', 'equation']) counter = 0 for r in self.regimes_set: """ @@ -446,11 +465,11 @@ def _error_regimes_multi(self, y, x, regimes, w, cores, cols2regi, vm, name_x): results[r] = results_p[r].get() self.vm[ - (counter * self.kr) : ((counter + 1) * self.kr), - (counter * self.kr) : ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), ] = results[r].vm self.betas[ - (counter * (self.kr + 1)) : ((counter + 1) * (self.kr + 1)), + (counter * (self.kr + 1)): ((counter + 1) * (self.kr + 1)), ] = results[r].betas self.u[ regi_ids[r], @@ -463,10 +482,14 @@ def _error_regimes_multi(self, y, x, regimes, w, cores, cols2regi, vm, name_x): ] = results[r].e_filtered self.name_y += results[r].name_y self.name_x += results[r].name_x + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x'] * (len(results[r].name_x) - 1) + + ['lambda'], + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - SUMMARY.GM_Error_multi(reg=self, multireg=self.multi, vm=vm, regimes=True) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class GM_Endog_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): @@ -513,6 +536,9 @@ class GM_Endog_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): If True, a separate regression is run for each regime. regime_lag_sep: boolean Always False, kept for consistency, ignored. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. vm : boolean If True, include variance-covariance matrix in summary results @@ -534,9 +560,13 @@ class GM_Endog_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -742,22 +772,17 @@ class GM_Endog_Error_Regimes(RegressionPropsY, REGI.Regimes_Frame): endogenous variables included. Alternatively, we can have a summary of the output by typing: model.summary - >>> print(model.name_z) - ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '0_RD90', '1_RD90', 'lambda'] - >>> np.around(model.betas, decimals=5) - array([[ 3.59718], - [ 1.0652 ], - [ 0.15822], - [ 9.19754], - [ 1.88082], - [-0.24878], - [ 2.46161], - [ 3.57943], - [ 0.25564]]) - >>> np.around(model.std_err, decimals=6) - array([0.522633, 0.137555, 0.063054, 0.473654, 0.18335 , 0.072786, - 0.300711, 0.240413]) - + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 3.597178 0.522633 6.882796 0.0 + 1 0_PS90 1.065203 0.137555 7.743852 0.0 + 2 0_UE90 0.15822 0.063054 2.509282 0.012098 + 6 0_RD90 2.461609 0.300711 8.185967 0.0 + 3 1_CONSTANT 9.197542 0.473654 19.418268 0.0 + 4 1_PS90 1.880815 0.18335 10.258046 0.0 + 5 1_UE90 -0.248777 0.072786 -3.417919 0.000631 + 7 1_RD90 3.579429 0.240413 14.888666 0.0 + 8 lambda 0.255639 None None None """ def __init__( @@ -774,6 +799,7 @@ def __init__( cols2regi="all", regime_err_sep=False, regime_lag_sep=False, + slx_lags=0, name_y=None, name_x=None, name_yend=None, @@ -783,14 +809,22 @@ def __init__( name_regimes=None, summ=True, add_lag=False, + latex=False, ): n = USER.check_arrays(y, x, yend, q) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) - x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) + x_constant, name_x, warn = USER.check_constant( + x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + + if slx_lags > 0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.constant_regi = constant_regi self.cols2regi = cols2regi self.name_ds = USER.set_name_ds(name_ds) @@ -822,6 +856,7 @@ def __init__( w, yend, q, + slx_lags, cores, cols2regi, vm, @@ -829,26 +864,29 @@ def __init__( name_yend, name_q, add_lag, + latex, ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: - x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) + x_constant = sphstack( + np.ones((x_constant.shape[0], 1)), x_constant) name_x = USER.set_name_x(name_x, x_constant) q, name_q = REGI.Regimes_Frame.__init__( self, q, regimes, constant_regi=None, cols2regi="all", names=name_q ) - x, name_x = REGI.Regimes_Frame.__init__( + x, name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True, ) - yend2, name_yend = REGI.Regimes_Frame.__init__( + yend2, name_yend, yend_rlist = REGI.Regimes_Frame.__init__( self, yend, regimes, @@ -856,6 +894,7 @@ def __init__( cols2regi=cols2regi, yend=True, names=name_yend, + rlist=True, ) tsls = BaseTSLS(y=y, x=x, yend=yend2, q=q) @@ -896,9 +935,19 @@ def __init__( self.kf += 1 self.chow = REGI.Chow(self) self._cache = {} + self.output = pd.DataFrame(self.name_z, + columns=['var_names']) + self.output['var_type'] = [ + 'x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + ['lambda'] + self.output['regime'] = x_rlist + yend_rlist + ['_Global'] + self.output['equation'] = 0 if summ: - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES - REGIMES" - SUMMARY.GM_Endog_Error(reg=self, w=w, vm=vm, regimes=True) + if slx_lags == 0: + self.title = ("GM SPATIALLY WEIGHTED 2SLS - REGIMES") + else: + self.title = ("GM SPATIALLY WEIGHTED 2SLS WITH SLX (SDEM) - REGIMES") + output(reg=self, vm=vm, robust=False, + other_end=False, latex=latex) def _endog_error_regimes_multi( self, @@ -908,6 +957,7 @@ def _endog_error_regimes_multi( w, yend, q, + slx_lags, cores, cols2regi, vm, @@ -915,6 +965,7 @@ def _endog_error_regimes_multi( name_yend, name_q, add_lag, + latex, ): regi_ids = dict( @@ -959,6 +1010,7 @@ def _endog_error_regimes_multi( self.name_w, self.name_regimes, add_lag, + slx_lags, ), ) else: @@ -979,6 +1031,7 @@ def _endog_error_regimes_multi( self.name_w, self.name_regimes, add_lag, + slx_lags, ) ) @@ -1009,6 +1062,8 @@ def _endog_error_regimes_multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 + self.output = pd.DataFrame( + columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -1022,11 +1077,11 @@ def _endog_error_regimes_multi( results[r] = results_p[r].get() self.vm[ - (counter * self.kr) : ((counter + 1) * self.kr), - (counter * self.kr) : ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), ] = results[r].vm self.betas[ - (counter * (self.kr + 1)) : ((counter + 1) * (self.kr + 1)), + (counter * (self.kr + 1)): ((counter + 1) * (self.kr + 1)), ] = results[r].betas self.u[ regi_ids[r], @@ -1050,15 +1105,20 @@ def _endog_error_regimes_multi( self.e_pred[ regi_ids[r], ] = results[r].e_pred + results[r].other_top = _spat_pseudo_r2(results[r]) + v_type = ['x'] * len(results[r].name_x) + ['yend'] * \ + (len(results[r].name_yend) - 1) + ['rho', 'lambda'] + else: + results[r].other_top = "" + v_type = ['x'] * len(results[r].name_x) + ['yend'] * \ + len(results[r].name_yend) + ['lambda'] + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_z, + 'var_type': v_type, + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - if add_lag != False: - SUMMARY.GM_Combo_multi(reg=self, multireg=self.multi, vm=vm, regimes=True) - else: - SUMMARY.GM_Endog_Error_multi( - reg=self, multireg=self.multi, vm=vm, regimes=True - ) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) class GM_Combo_Regimes(GM_Endog_Error_Regimes, REGI.Regimes_Frame): @@ -1107,6 +1167,9 @@ class GM_Combo_Regimes(GM_Endog_Error_Regimes, REGI.Regimes_Frame): If True, the spatial parameter for spatial lag is also computed according to different regimes. If False (default), the spatial parameter is fixed accross regimes. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the GNSM type. w_lags : integer Orders of W to include as instruments for the spatially lagged dependent variable. For example, w_lags=1, then @@ -1135,9 +1198,13 @@ class GM_Combo_Regimes(GM_Endog_Error_Regimes, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -1350,22 +1417,16 @@ class GM_Combo_Regimes(GM_Endog_Error_Regimes, REGI.Regimes_Frame): output by typing: model.summary Alternatively, we can check the betas: - >>> print(model.name_z) - ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '_Global_W_HR90', 'lambda'] - >>> print(np.around(model.betas,4)) - [[ 1.4607] - [ 0.958 ] - [ 0.5658] - [ 9.113 ] - [ 1.1338] - [ 0.6517] - [-0.4583] - [ 0.6136]] - - And lambda: - - >>> print('lambda: ', np.around(model.betas[-1], 4)) - lambda: [0.6136] + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 1.460707 0.704174 2.074356 0.038046 + 1 0_PS90 0.95795 0.171485 5.586214 0.0 + 2 0_UE90 0.565805 0.053665 10.543203 0.0 + 3 1_CONSTANT 9.112998 1.525875 5.972311 0.0 + 4 1_PS90 1.13382 0.20552 5.51683 0.0 + 5 1_UE90 0.65169 0.061106 10.664938 0.0 + 6 _Global_W_HR90 -0.458326 0.145599 -3.147859 0.001645 + 7 lambda 0.613599 None None None This class also allows the user to run a spatial lag+error model with the extra feature of including non-spatial endogenous regressors. This means @@ -1383,24 +1444,18 @@ class GM_Combo_Regimes(GM_Endog_Error_Regimes, REGI.Regimes_Frame): And then we can run and explore the model analogously to the previous combo: >>> model = GM_Combo_Regimes(y, x, regimes, yd, q, w=w, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT') - >>> print(model.name_z) - ['0_CONSTANT', '0_PS90', '0_UE90', '1_CONSTANT', '1_PS90', '1_UE90', '0_RD90', '1_RD90', '_Global_W_HR90', 'lambda'] - >>> print(model.betas) - [[ 3.41963782] - [ 1.04065841] - [ 0.16634393] - [ 8.86544628] - [ 1.85120528] - [-0.24908469] - [ 2.43014046] - [ 3.61645481] - [ 0.03308671] - [ 0.18684992]] - >>> print(np.sqrt(model.vm.diagonal())) - [0.53067577 0.13271426 0.06058025 0.76406411 0.17969783 0.07167421 - 0.28943121 0.25308326 0.06126529] - >>> print('lambda: ', np.around(model.betas[-1], 4)) - lambda: [0.1868] + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 3.419638 0.530676 6.443931 0.0 + 1 0_PS90 1.040658 0.132714 7.841346 0.0 + 2 0_UE90 0.166344 0.06058 2.745844 0.006036 + 6 0_RD90 2.43014 0.289431 8.396263 0.0 + 3 1_CONSTANT 8.865446 0.764064 11.603014 0.0 + 4 1_PS90 1.851205 0.179698 10.301769 0.0 + 5 1_UE90 -0.249085 0.071674 -3.475235 0.00051 + 7 1_RD90 3.616455 0.253083 14.289586 0.0 + 8 _Global_W_HR90 0.033087 0.061265 0.540057 0.589158 + 9 lambda 0.18685 None None None """ def __init__( @@ -1411,6 +1466,7 @@ def __init__( yend=None, q=None, w=None, + slx_lags=0, w_lags=1, lag_q=True, cores=False, @@ -1426,22 +1482,41 @@ def __init__( name_w=None, name_ds=None, name_regimes=None, + latex=False, ): - + if regime_lag_sep and not regime_err_sep: + set_warn(self, "regime_err_sep set to True when regime_lag_sep=True.") + regime_err_sep = True + if regime_err_sep and not regime_lag_sep: + set_warn(self, "regime_err_sep set to False when regime_lag_sep=False.") + regime_err_sep = False n = USER.check_arrays(y, x) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) - x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) + x_constant, name_x, warn = USER.check_constant( + x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) self.name_y = USER.set_name_y(name_y) name_yend = USER.set_name_yend(name_yend, yend) name_q = USER.set_name_q(name_q, q) - name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) - cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend, add_cons=False - ) + if regime_err_sep and any(col != True for col in cols2regi): + set_warn(self, "All coefficients must vary across regimes if regime_err_sep = True, so setting cols2regi = 'all'.") + cols2regi = "all" + + if slx_lags > 0: + yend2, q2, wx = set_endog(y, x_constant, w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) + name_slx = USER.set_name_spatial_lags(name_x, slx_lags) + name_q.extend(USER.set_name_q_sp(name_slx[-len(name_x):], w_lags, name_q, lag_q, force_all=True)) + name_x += name_slx + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant[:, :-1], yend=yend2, add_cons=False) + else: + name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) + yend2, q2 = yend, q + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False) + self.regimes_set = REGI._get_regimes_set(regimes) self.regimes = regimes USER.check_regimes(self.regimes_set, n, x_constant.shape[1]) @@ -1449,27 +1524,27 @@ def __init__( self.regime_lag_sep = regime_lag_sep if regime_lag_sep == True: - if regime_err_sep == False: - raise Exception( - "For spatial combo models, if spatial lag is set by regimes (regime_lag_sep=True), spatial error must also be set by regimes (regime_err_sep=True)." - ) - add_lag = [w_lags, lag_q] + if slx_lags == 0: + add_lag = [w_lags, lag_q] + else: + add_lag = False + cols2regi += [True] + else: - if regime_err_sep == True: - raise Exception( - "For spatial combo models, if spatial error is set by regimes (regime_err_sep=True), all coefficients including lambda (regime_lag_sep=True) must be set by regimes." - ) - cols2regi += [False] add_lag = False - yend, q = set_endog(y, x_constant, w, yend, q, w_lags, lag_q) + cols2regi += [False] + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant, w, yend2, q2, w_lags, lag_q) + name_yend.append(USER.set_name_yend_sp(self.name_y)) + print(cols2regi, x_constant.shape[1], yend2.shape[1], name_x, name_yend, name_q) GM_Endog_Error_Regimes.__init__( self, y=y, x=x_constant, - yend=yend, - q=q, + yend=yend2, + q=q2, regimes=regimes, w=w, vm=vm, @@ -1486,17 +1561,399 @@ def __init__( name_regimes=name_regimes, summ=False, add_lag=add_lag, + latex=latex, ) if regime_err_sep != True: self.rho = self.betas[-2] self.predy_e, self.e_pred, warn = sp_att( - w, self.y, self.predy, yend[:, -1].reshape(self.n, 1), self.rho + w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) set_warn(self, warn) - self.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES - REGIMES" - SUMMARY.GM_Combo(reg=self, w=w, vm=vm, regimes=True) + if slx_lags == 0: + self.title = "SPATIALLY WEIGHTED 2SLS - GM-COMBO MODEL - REGIMES" + else: + self.title = "SPATIALLY WEIGHTED 2SLS - GM-COMBO WITH SLX (GNSM) - REGIMES" + self.output.iat[-2, + self.output.columns.get_loc('var_type')] = 'rho' + self.other_top = _spat_pseudo_r2(self) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) + + +class GMM_Error_Regimes(GM_Error_Regimes, GM_Combo_Regimes, GM_Endog_Error_Regimes, + GM_Error_Het_Regimes, GM_Combo_Het_Regimes, GM_Endog_Error_Het_Regimes, + GM_Error_Hom_Regimes, GM_Combo_Hom_Regimes, GM_Endog_Error_Hom_Regimes + ): + + """ + Wrapper function to call any of the GM methods for a spatial error regimes model available in spreg + + Parameters + ---------- + y : array + nx1 array for dependent variable + x : array + Two dimensional array with n rows and one column for each + independent (exogenous) variable, excluding the constant + regimes : list + List of n values with the mapping of each + observation to a regime. Assumed to be aligned with 'x'. + w : pysal W object + Spatial weights object (always needed) + yend : array + Two dimensional array with n rows and one column for each + endogenous variable (if any) + q : array + Two dimensional array with n rows and one column for each + external exogenous variable to use as instruments (if any) + (note: this should not contain any variables from x) + estimator : string + Choice of estimator to be used. Options are: 'het', which + is robust to heteroskedasticity, 'hom', which assumes + homoskedasticity, and 'kp98', which does not provide + inference on the spatial parameter for the error term. + constant_regi: string, optional + Switcher controlling the constant term setup. It may take + the following values: + + * 'one': a vector of ones is appended to x and held constant across regimes. + + * 'many': a vector of ones is appended to x and considered different per regime (default). + cols2regi : list, 'all' + Argument indicating whether each + column of x should be considered as different per regime + or held constant across regimes (False). + If a list, k booleans indicating for each variable the + option (True if one per regime, False to be held constant). + If 'all' (default), all the variables vary by regime. + regime_err_sep: boolean + If True, a separate regression is run for each regime. + regime_lag_sep: boolean + Always False, kept for consistency, ignored. + add_wy : boolean + If True, then a spatial lag of the dependent variable is included. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM or GNSM type. + vm : boolean + If True, include variance-covariance matrix in summary + results + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_regimes : string + Name of regime variable for use in the output + name_yend : list of strings + Names of endogenous variables for use in output + name_q : list of strings + Names of instruments for use in output + name_ds : string + Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format + **kwargs : keywords + Additional arguments to pass on to the estimators. + See the specific functions for details on what can be used. + + Attributes + ---------- + output : dataframe + regression results pandas dataframe + summary : string + Summary of regression results and diagnostics (note: use in + conjunction with the print command) + betas : array + kx1 array of estimated coefficients + u : array + nx1 array of residuals + e_filtered : array + nx1 array of spatially filtered residuals + predy : array + nx1 array of predicted y values + n : integer + Number of observations + k : integer + Number of variables for which coefficients are estimated + (including the constant) + y : array + nx1 array for dependent variable + x : array + Two dimensional array with n rows and one column for each + independent (exogenous) variable, including the constant + mean_y : float + Mean of dependent variable + std_y : float + Standard deviation of dependent variable + pr2 : float + Pseudo R squared (squared correlation between y and ypred) + vm : array + Variance covariance matrix (kxk) + sig2 : float + Sigma squared used in computations + std_err : array + 1xk array of standard errors of the betas + z_stat : list of tuples + z statistic; each tuple contains the pair (statistic, + p-value), where each is a float + name_y : string + Name of dependent variable for use in output + name_x : list of strings + Names of independent variables for use in output + name_w : string + Name of weights matrix for use in output + name_ds : string + Name of dataset for use in output + name_regimes : string + Name of regime variable for use in the output + title : string + Name of the regression method used + Only available in dictionary 'multi' when multiple regressions + (see 'multi' below for details) + regimes : list + List of n values with the mapping of each + observation to a regime. Assumed to be aligned with 'x'. + constant_regi: string + Ignored if regimes=False. Constant option for regimes. + Switcher controlling the constant term setup. It may take + the following values: + * 'one': a vector of ones is appended to x and held constant across regimes + + * 'many': a vector of ones is appended to x and considered different per regime + cols2regi : list, 'all' + Ignored if regimes=False. Argument indicating whether each + column of x should be considered as different per regime + or held constant across regimes (False). + If a list, k booleans indicating for each variable the + option (True if one per regime, False to be held constant). + If 'all', all the variables vary by regime. + regime_err_sep: boolean + If True, a separate regression is run for each regime. + kr : int + Number of variables/columns to be "regimized" or subject + to change by regime. These will result in one parameter + estimate by regime for each variable (i.e. nr parameters per + variable) + kf : int + Number of variables/columns to be considered fixed or + global across regimes and hence only obtain one parameter + estimate + nr : int + Number of different regimes in the 'regimes' list + name_yend : list of strings (optional) + Names of endogenous variables for use in output + name_z : list of strings (optional) + Names of exogenous and endogenous variables for use in + output + name_q : list of strings (optional) + Names of external instruments + name_h : list of strings (optional) + Names of all instruments used in ouput + multi : dictionary + Only available when multiple regressions are estimated, + i.e. when regime_err_sep=True and no variable is fixed + across regimes. + Contains all attributes of each individual regression + + Examples + -------- + + We first need to import the needed modules, namely numpy to convert the + data we read into arrays that ``spreg`` understands and ``libpysal`` to + handle the weights and file management. + + >>> import numpy as np + >>> import libpysal + >>> from libpysal.examples import load_example + + Open data on NCOVR US County Homicides (3085 areas) using libpysal.io.open(). + This is the DBF associated with the NAT shapefile. Note that + libpysal.io.open() also reads data in CSV format; since the actual class + requires data to be passed in as numpy arrays, the user can read their + data in using any method. + + >>> nat = load_example('Natregimes') + >>> db = libpysal.io.open(nat.get_path("natregimes.dbf"),'r') + + Extract the HR90 column (homicide rates in 1990) from the DBF file and make it the + dependent variable for the regression. Note that PySAL requires this to be + an numpy array of shape (n, 1) as opposed to the also common shape of (n, ) + that other packages accept. + + >>> y_var = 'HR90' + >>> y = np.array([db.by_col(y_var)]).reshape(3085,1) + + Extract UE90 (unemployment rate) and PS90 (population structure) vectors from + the DBF to be used as independent variables in the regression. Other variables + can be inserted by adding their names to x_var, such as x_var = ['Var1','Var2','...] + Note that PySAL requires this to be an nxj numpy array, where j is the + number of independent variables (not including a constant). By default + this model adds a vector of ones to the independent variables passed in. + + >>> x_var = ['PS90','UE90'] + >>> x = np.array([db.by_col(name) for name in x_var]).T + + The different regimes in this data are given according to the North and + South dummy (SOUTH). + + >>> r_var = 'SOUTH' + >>> regimes = db.by_col(r_var) + + Since we want to run a spatial error model, we need to specify + the spatial weights matrix that includes the spatial configuration of the + observations. To do that, we can open an already existing gal file or + create a new one. In this case, we will create one from ``NAT.shp``. + + >>> w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp")) + + Unless there is a good reason not to do it, the weights have to be + row-standardized so every row of the matrix sums to one. Among other + things, this allows to interpret the spatial lag of a variable as the + average value of the neighboring observations. In PySAL, this can be + easily performed in the following way: + + >>> w.transform = 'r' + + The GMM_Error_Regimes class can run error models and SARAR models, that is a spatial lag+error model. + In this example we will run a simple version of the latter, where we have the + spatial effects as well as exogenous variables. Since it is a spatial + model, we have to pass in the weights matrix. If we want to + have the names of the variables printed in the output summary, we will + have to pass them in as well, although this is optional. + + >>> from spreg import GMM_Error_Regimes + >>> model = GMM_Error_Regimes(y, x, regimes, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT') + + Once we have run the model, we can explore a little bit the output. The + regression object we have created has many attributes so take your time to + discover them. + + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 1.461317 0.848361 1.722517 0.084976 + 1 0_PS90 0.958711 0.239834 3.997388 0.000064 + 2 0_UE90 0.565825 0.063726 8.879088 0.0 + 3 1_CONSTANT 9.115738 1.976874 4.611189 0.000004 + 4 1_PS90 1.132419 0.334107 3.389387 0.0007 + 5 1_UE90 0.651804 0.105518 6.177197 0.0 + 6 _Global_W_HR90 -0.458677 0.180997 -2.534173 0.011271 + 7 lambda 0.734354 0.035255 20.829823 0.0 + + This class also allows the user to run a spatial lag+error model with the + extra feature of including non-spatial endogenous regressors. This means + that, in addition to the spatial lag and error, we consider some of the + variables on the right-hand side of the equation as endogenous and we + instrument for this. In this case we consider RD90 (resource deprivation) + as an endogenous regressor. We use FP89 (families below poverty) + for this and hence put it in the instruments parameter, 'q'. + + >>> yd_var = ['RD90'] + >>> yd = np.array([db.by_col(name) for name in yd_var]).T + >>> q_var = ['FP89'] + >>> q = np.array([db.by_col(name) for name in q_var]).T + + And then we can run and explore the model analogously to the previous combo: + + >>> model = GMM_Error_Regimes(y, x, regimes, yend=yd, q=q, w=w, add_wy=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, name_ds='NAT') + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 1.461317 0.848361 1.722517 0.084976 + 1 0_PS90 0.958711 0.239834 3.997388 0.000064 + 2 0_UE90 0.565825 0.063726 8.879088 0.0 + 3 1_CONSTANT 9.115738 1.976874 4.611189 0.000004 + 4 1_PS90 1.132419 0.334107 3.389387 0.0007 + 5 1_UE90 0.651804 0.105518 6.177197 0.0 + 6 _Global_W_HR90 -0.458677 0.180997 -2.534173 0.011271 + 7 lambda 0.734354 0.035255 20.829823 0.0 + + The class also allows for estimating a GNS model by adding spatial lags of the exogenous variables, using the argument slx_lags: + + >>> model = GMM_Error_Regimes(y, x, regimes, w=w, add_wy=True, slx_lags=1, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT') + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 0.192699 0.256922 0.75003 0.453237 + 1 0_PS90 1.098019 0.232054 4.731743 0.000002 + 2 0_UE90 0.606622 0.07762 7.815325 0.0 + 3 0_W_PS90 -1.068778 0.203911 -5.241381 0.0 + 4 0_W_UE90 -0.657932 0.176073 -3.73671 0.000186 + 5 1_CONSTANT -0.104299 1.790953 -0.058237 0.95356 + 6 1_PS90 1.219796 0.316425 3.854936 0.000116 + 7 1_UE90 0.678922 0.120491 5.634647 0.0 + 8 1_W_PS90 -1.308599 0.536231 -2.440366 0.014672 + 9 1_W_UE90 -0.708492 0.167057 -4.24102 0.000022 + 10 _Global_W_HR90 1.033956 0.269252 3.840111 0.000123 + 11 lambda -0.384968 0.192256 -2.002366 0.045245 + + + """ + + def __init__( + self, y, x, regimes, w, yend=None, q=None, estimator='het', constant_regi="many", cols2regi="all", regime_err_sep=False, + regime_lag_sep=False, add_wy=False, slx_lags=0, vm=False, name_y=None, name_x=None, name_w=None, name_regimes=None, name_yend=None, + name_q=None, name_ds=None, latex=False, **kwargs): + + if estimator == 'het': + if yend is None and not add_wy: + GM_Error_Het_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + elif yend is not None and not add_wy: + GM_Endog_Error_Het_Regimes.__init__(self, y=y, x=x, regimes=regimes, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + elif add_wy: + GM_Combo_Het_Regimes.__init__(self, y=y, x=x, regimes=regimes, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, regime_lag_sep=regime_lag_sep, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.') + GM_Error_Het_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex) + elif estimator == 'hom': + if yend is None and not add_wy: + GM_Error_Hom_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + elif yend is not None and not add_wy: + GM_Endog_Error_Hom_Regimes.__init__(self, y=y, x=x, regimes=regimes, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + elif add_wy: + GM_Combo_Hom_Regimes.__init__(self, y=y, x=x, regimes=regimes, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, regime_lag_sep=regime_lag_sep, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.') + GM_Error_Hom_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex) + elif estimator == 'kp98': + if yend is None and not add_wy: + GM_Error_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + elif yend is not None and not add_wy: + GM_Endog_Error_Regimes.__init__(self, y=y, x=x, regimes=regimes, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + elif add_wy: + GM_Combo_Regimes.__init__(self, y=y, x=x, regimes=regimes, yend=yend, q=q, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, regime_lag_sep=regime_lag_sep, + name_yend=name_yend, name_q=name_q, name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex, **kwargs) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.') + GM_Error_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex) + else: + set_warn(self, 'Combination of arguments passed to GMM_Error_Regimes not allowed. Using default arguments instead.') + GM_Error_Het_Regimes.__init__(self, y=y, x=x, regimes=regimes, w=w, slx_lags=slx_lags, vm=vm, name_y=name_y, name_x=name_x, + constant_regi=constant_regi, cols2regi=cols2regi, regime_err_sep=regime_err_sep, + name_w=name_w, name_regimes=name_regimes, name_ds=name_ds, latex=latex) def _work_error(y, x, regi_ids, r, w, name_ds, name_y, name_x, name_w, name_regimes): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) @@ -1505,7 +1962,7 @@ def _work_error(y, x, regi_ids, r, w, name_ds, name_y, name_x, name_w, name_regi model = BaseGM_Error(y_r, x_r, w_r.sparse) set_warn(model, warn) model.w = w_r - model.title = "SPATIALLY WEIGHTED LEAST SQUARES ESTIMATION - REGIME %s" % r + model.title = "GM SPATIALLY WEIGHTED LEAST SQUARES ESTIMATION - REGIME %s" % r model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] @@ -1530,6 +1987,7 @@ def _work_endog_error( name_w, name_regimes, add_lag, + slx_lags, ): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) y_r = y[regi_ids[r]] @@ -1548,16 +2006,26 @@ def _work_endog_error( if add_lag != False: model.rho = model.betas[-2] model.predy_e, model.e_pred, warn = sp_att( - w_r, model.y, model.predy, model.yend[:, -1].reshape(model.n, 1), model.rho + w_r, model.y, model.predy, model.yend[:, - + 1].reshape(model.n, 1), model.rho ) set_warn(model, warn) model.w = w_r - model.title = "SPATIALLY WEIGHTED TWO STAGE LEAST SQUARES - REGIME %s" % r + if slx_lags == 0: + if add_lag != False: + model.title = "SPATIALLY WEIGHTED 2SLS - GM-COMBO MODEL - REGIME %s" % r + else: + model.title = "SPATIALLY WEIGHTED 2SLS (GM) - REGIME %s" % r + else: + if add_lag != False: + model.title = "GM SPATIAL COMBO MODEL + SLX (GNSM) - REGIME %s" % r + else: + model.title = "GM SPATIALLY WEIGHTED 2SLS + SLX (SDEM) - REGIME %s" % r model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] model.name_yend = ["%s_%s" % (str(r), i) for i in name_yend] - model.name_z = model.name_x + model.name_yend + ["lambda"] + model.name_z = model.name_x + model.name_yend + [str(r)+"_lambda"] model.name_q = ["%s_%s" % (str(r), i) for i in name_q] model.name_h = model.name_x + model.name_q model.name_w = name_w @@ -1575,31 +2043,34 @@ def _test(): if __name__ == "__main__": - _test() - import libpysal import numpy as np + import libpysal - dbf = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r") - y = np.array([dbf.by_col("CRIME")]).T - names_to_extract = ["INC"] - x = np.array([dbf.by_col(name) for name in names_to_extract]).T - yd_var = ["HOVAL"] - yend = np.array([dbf.by_col(name) for name in yd_var]).T - q_var = ["DISCBD"] - q = np.array([dbf.by_col(name) for name in q_var]).T - regimes = regimes = dbf.by_col("NSA") - w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), "r").read() - w.transform = "r" - model = GM_Error_Regimes( - y, - x, - regimes=regimes, - w=w, - name_y="crime", - name_x=["income"], - name_regimes="nsa", - name_ds="columbus", - regime_err_sep=True, - ) - print(model.summary) + db = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'), 'r') + y = np.array(db.by_col("HOVAL")) + y = np.reshape(y, (49, 1)) + X = [] + X.append(db.by_col("INC")) + X = np.array(X).T + yd = [] + yd.append(db.by_col("CRIME")) + yd = np.array(yd).T + q = [] + q.append(db.by_col("DISCBD")) + q = np.array(q).T + + r_var = 'NSA' + regimes = db.by_col(r_var) + + w = libpysal.weights.Rook.from_shapefile( + libpysal.examples.get_path("columbus.shp")) + w.transform = 'r' + # reg = GM_Error_Regimes(y, X, regimes, w=w, name_x=['inc'], name_y='hoval', name_ds='columbus', + # regime_err_sep=True) + # reg = GM_Endog_Error_Regimes(y, X, yd, q, regimes, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + # name_q=['discbd'], name_ds='columbus', regime_err_sep=True) + reg = GM_Combo_Regimes(y, X, regimes, yd, q, w=w, name_x=['inc'], name_y='hoval', name_yend=['crime'], + name_q=['discbd'], name_ds='columbus', regime_err_sep=True, regime_lag_sep=True) + print(reg.output) + print(reg.summary) diff --git a/spreg/ml_error.py b/spreg/ml_error.py index 2d505324..bb338677 100644 --- a/spreg/ml_error.py +++ b/spreg/ml_error.py @@ -10,12 +10,13 @@ import numpy.linalg as la from scipy import sparse as sp from scipy.sparse.linalg import splu as SuperLU -from .utils import RegressionPropsY, RegressionPropsVM, set_warn +from .utils import RegressionPropsY, RegressionPropsVM, set_warn, get_lags from . import diagnostics as DIAG from . import user_output as USER -from . import summary_output as SUMMARY from . import regimes as REGI from .w_utils import symmetrize +import pandas as pd +from .output import output, _nonspat_top try: from scipy.optimize import minimize_scalar @@ -310,6 +311,9 @@ class ML_Error(BaseML_Error): independent (exogenous) variable, excluding the constant w : Sparse matrix Spatial weights sparse matrix + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. method : string if 'full', brute force calculation (full matrix expressions) if 'ord', Ord eigenvalue method @@ -327,9 +331,13 @@ class ML_Error(BaseML_Error): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe betas : array (k+1)x1 array of estimated coefficients (rho first) lam : float @@ -463,6 +471,7 @@ def __init__( y, x, w, + slx_lags=0, method="full", epsilon=0.0000001, vm=False, @@ -470,17 +479,26 @@ def __init__( name_x=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) x_constant, name_x, warn = USER.check_constant(x, name_x) set_warn(self, warn) + + self.title = "ML SPATIAL ERROR" + if slx_lags >0: + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title += " WITH SLX (SDEM)" + self.title += " (METHOD = " + method + ")" + method = method.upper() BaseML_Error.__init__( self, y=y, x=x_constant, w=w, method=method, epsilon=epsilon ) - self.title = "MAXIMUM LIKELIHOOD SPATIAL ERROR" + " (METHOD = " + method + ")" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x) @@ -488,7 +506,11 @@ def __init__( self.name_w = USER.set_name_w(name_w, w) self.aic = DIAG.akaike(reg=self) self.schwarz = DIAG.schwarz(reg=self) - SUMMARY.ML_Error(reg=self, w=w, vm=vm, spat_diag=False) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x) - 1) + ['lambda'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _nonspat_top(self, ml=True) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def err_c_loglik(lam, n, y, ylag, x, xlag, W): @@ -565,3 +587,29 @@ def _test(): np.set_printoptions(suppress=True) doctest.testmod() np.set_printoptions(suppress=start_suppress) + +if __name__ == "__main__": + _test() + + import numpy as np + import libpysal + + db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r") + y_var = "CRIME" + y = np.array([db.by_col(y_var)]).reshape(49, 1) + x_var = ["INC"] + x = np.array([db.by_col(name) for name in x_var]).T + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = "r" + model = ML_Error( + y, + x, + w=w, + vm=False, + name_y=y_var, + name_x=x_var, + name_ds="columbus", + name_w="columbus.gal", + ) + print(model.output) + print(model.summary) diff --git a/spreg/ml_error_regimes.py b/spreg/ml_error_regimes.py index 03384495..721fd115 100644 --- a/spreg/ml_error_regimes.py +++ b/spreg/ml_error_regimes.py @@ -9,12 +9,13 @@ import multiprocessing as mp from . import regimes as REGI from . import user_output as USER -from . import summary_output as SUMMARY from . import diagnostics as DIAG -from .utils import set_warn +from .utils import set_warn, get_lags from .sputils import sphstack from .ml_error import BaseML_Error from platform import system +import pandas as pd +from .output import output, _nonspat_top __all__ = ["ML_Error_Regimes"] @@ -51,6 +52,9 @@ class ML_Error_Regimes(BaseML_Error, REGI.Regimes_Frame): If 'all' (default), all the variables vary by regime. w : Sparse matrix Spatial weights sparse matrix + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SDEM type. method : string if 'full', brute force calculation (full matrix expressions) if 'ord', Ord eigenvalue computation @@ -78,9 +82,13 @@ class ML_Error_Regimes(BaseML_Error, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regimes variable for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -276,6 +284,7 @@ def __init__( x, regimes, w=None, + slx_lags=0, constant_regi="many", cols2regi="all", method="full", @@ -289,14 +298,13 @@ def __init__( name_w=None, name_ds=None, name_regimes=None, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) self.constant_regi = constant_regi - self.cols2regi = cols2regi - self.regime_err_sep = regime_err_sep self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_w = USER.set_name_w(name_w, w) @@ -307,9 +315,16 @@ def __init__( x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + + if slx_lags >0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.name_x_r = USER.set_name_x(name_x, x_constant) - cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x) + cols2regi = REGI.check_cols2regi(constant_regi, cols2regi, x_constant) + self.cols2regi = cols2regi self.regimes_set = REGI._get_regimes_set(regimes) self.regimes = regimes USER.check_regimes(self.regimes_set, self.n, x.shape[1]) @@ -322,16 +337,18 @@ def __init__( x_constant, regimes, w, + slx_lags, cores, method, epsilon, cols2regi, vm, name_x, + latex, ) else: raise Exception( - "All coefficients must vary accross regimes if regime_err_sep = True." + "All coefficients must vary across regimes if regime_err_sep = True." ) else: x_constant = sphstack(np.ones((x_constant.shape[0], 1)), x_constant) @@ -340,15 +357,15 @@ def __init__( regimes_att["x"] = x_constant regimes_att["regimes"] = regimes regimes_att["cols2regi"] = cols2regi - x, name_x = REGI.Regimes_Frame.__init__( + x, name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi=None, cols2regi=cols2regi, names=name_x, + rlist=True ) - BaseML_Error.__init__( self, y=y, @@ -359,22 +376,26 @@ def __init__( regimes_att=regimes_att, ) - self.title = ( - "MAXIMUM LIKELIHOOD SPATIAL ERROR - REGIMES" - + " (METHOD = " - + method - + ")" - ) + self.title = "ML SPATIAL ERROR" + if slx_lags >0: + self.title += " WITH SLX (SDEM)" + self.title += " - REGIMES (METHOD = " + method + ")" + self.name_x = USER.set_name_x(name_x, x, constant=True) self.name_x.append("lambda") self.kf += 1 # Adding a fixed k to account for lambda. self.chow = REGI.Chow(self) self.aic = DIAG.akaike(reg=self) self.schwarz = DIAG.schwarz(reg=self) - SUMMARY.ML_Error(reg=self, w=w, vm=vm, spat_diag=False, regimes=True) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x) - 1) + ['lambda'] + self.output['regime'] = x_rlist + ['_Global'] + self.output['equation'] = 0 + self.other_top = _nonspat_top(self, ml=True) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def _error_regimes_multi( - self, y, x, regimes, w, cores, method, epsilon, cols2regi, vm, name_x + self, y, x, regimes, w, slx_lags, cores, method, epsilon, cols2regi, vm, name_x, latex ): regi_ids = dict( @@ -404,6 +425,7 @@ def _error_regimes_multi( regi_ids, r, w, + slx_lags, method, epsilon, self.name_ds, @@ -421,6 +443,7 @@ def _error_regimes_multi( regi_ids, r, w, + slx_lags, method, epsilon, self.name_ds, @@ -452,6 +475,7 @@ def _error_regimes_multi( results = {} counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -482,16 +506,18 @@ def _error_regimes_multi( ] = results[r].e_filtered self.name_y += results[r].name_y self.name_x += results[r].name_x + results[r].other_top = _nonspat_top(results[r], ml=True) + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x'] * (len(results[r].name_x) - 1) + ['lambda'], + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.chow = REGI.Chow(self) self.multi = results - SUMMARY.ML_Error_multi( - reg=self, multireg=self.multi, vm=vm, spat_diag=False, regimes=True, w=w - ) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def _work_error( - y, x, regi_ids, r, w, method, epsilon, name_ds, name_y, name_x, name_w, name_regimes + y, x, regi_ids, r, w, slx_lags, method, epsilon, name_ds, name_y, name_x, name_w, name_regimes ): w_r, warn = REGI.w_regime(w, regi_ids[r], r, transform=True) y_r = y[regi_ids[r]] @@ -499,13 +525,10 @@ def _work_error( model = BaseML_Error(y=y_r, x=x_r, w=w_r, method=method, epsilon=epsilon) set_warn(model, warn) model.w = w_r - model.title = ( - "MAXIMUM LIKELIHOOD SPATIAL ERROR - REGIME " - + str(r) - + " (METHOD = " - + method - + ")" - ) + model.title = "ML SPATIAL ERROR" + if slx_lags >0: + model.title += " WITH SLX (SDEM)" + model.title += " - REGIME " + str(r) + " (METHOD = " + method + ")" model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] @@ -528,30 +551,23 @@ def _test(): if __name__ == "__main__": _test() import numpy as np - import libpysal + import libpysal as ps - db = libpysal.io.open(libpysal.examples.get_path("baltim.dbf"), "r") + db = ps.io.open(ps.examples.get_path("baltim.dbf"), "r") ds_name = "baltim.dbf" y_name = "PRICE" y = np.array(db.by_col(y_name)).T y.shape = (len(y), 1) x_names = ["NROOM", "NBATH", "PATIO", "FIREPL", "AC", "GAR", "AGE", "LOTSZ", "SQFT"] x = np.array([db.by_col(var) for var in x_names]).T - ww = ps.open(ps.examples.get_path("baltim_q.gal")) + ww = ps.io.open(ps.examples.get_path("baltim_q.gal")) w = ww.read() ww.close() w_name = "baltim_q.gal" w.transform = "r" + regimes = db.by_col("CITCOU") - regimes = [] - y_coord = np.array(db.by_col("Y")) - for i in y_coord: - if i > 544.5: - regimes.append("North") - else: - regimes.append("South") - - mlerror = ML_Error_Regimes( + model = ML_Error_Regimes( y, x, regimes, @@ -561,7 +577,9 @@ def _test(): name_x=x_names, name_w=w_name, name_ds=ds_name, - regime_err_sep=False, - name_regimes="North", + regime_err_sep=True, + constant_regi="many", + name_regimes="CITCOU", ) - print(mlerror.summary) + print(model.output) + print(model.summary) diff --git a/spreg/ml_lag.py b/spreg/ml_lag.py old mode 100644 new mode 100755 index 7d7e2e9f..c3ec794c --- a/spreg/ml_lag.py +++ b/spreg/ml_lag.py @@ -10,11 +10,12 @@ import numpy.linalg as la from scipy import sparse as sp from scipy.sparse.linalg import splu as SuperLU -from .utils import RegressionPropsY, RegressionPropsVM, inverse_prod, set_warn +from .utils import RegressionPropsY, RegressionPropsVM, inverse_prod, set_warn, get_lags from .sputils import spdot, spfill_diagonal, spinv, spbroadcast from . import diagnostics as DIAG from . import user_output as USER -from . import summary_output as SUMMARY +import pandas as pd +from .output import output, _nonspat_top, _spat_pseudo_r2 from .w_utils import symmetrize from libpysal import weights @@ -43,6 +44,9 @@ class BaseML_Lag(RegressionPropsY, RegressionPropsVM): independent (exogenous) variable, excluding the constant w : pysal W object Spatial weights object + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the Spatial Durbin type. method : string if 'full', brute force calculation (full matrix expressions) if 'ord', Ord eigenvalue method @@ -179,25 +183,29 @@ class BaseML_Lag(RegressionPropsY, RegressionPropsVM): """ - def __init__(self, y, x, w, method="full", epsilon=0.0000001): + def __init__(self, y, x, w, slx_lags=0, method="full", epsilon=0.0000001): # set up main regression variables and spatial filters self.y = y self.x = x - self.n, self.k = self.x.shape self.method = method self.epsilon = epsilon # W = w.full()[0] # Wsp = w.sparse ylag = weights.lag_spatial(w, y) # b0, b1, e0 and e1 + + if slx_lags>0: + self.x = np.hstack((self.x, get_lags(w, self.x[:, 1:], slx_lags))) + + self.n, self.k = self.x.shape xtx = spdot(self.x.T, self.x) xtxi = la.inv(xtx) xty = spdot(self.x.T, self.y) xtyl = spdot(self.x.T, ylag) b0 = spdot(xtxi, xty) b1 = spdot(xtxi, xtyl) - e0 = self.y - spdot(x, b0) - e1 = ylag - spdot(x, b1) + e0 = self.y - spdot(self.x, b0) + e1 = ylag - spdot(self.x, b1) methodML = method.upper() # call minimizer using concentrated log-likelihood to get rho if methodML in ["FULL", "LU", "ORD"]: @@ -209,19 +217,19 @@ def __init__(self, y, x, w, method="full", epsilon=0.0000001): bounds=(-1.0, 1.0), args=(self.n, e0, e1, W), method="bounded", - tol=epsilon, + options={'xatol': epsilon}, ) elif methodML == "LU": I = sp.identity(w.n) Wsp = w.sparse # moved here - W = Wsp + W = Wsp#.tocsc() res = minimize_scalar( lag_c_loglik_sp, 0.0, bounds=(-1.0, 1.0), args=(self.n, e0, e1, I, Wsp), method="bounded", - tol=epsilon, + options={'xatol': epsilon}, ) elif methodML == "ORD": # check on symmetry structure @@ -239,7 +247,7 @@ def __init__(self, y, x, w, method="full", epsilon=0.0000001): bounds=(-1.0, 1.0), args=(self.n, e0, e1, evals), method="bounded", - tol=epsilon, + options={'xatol': epsilon}, ) else: # program will crash, need to catch @@ -261,7 +269,7 @@ def __init__(self, y, x, w, method="full", epsilon=0.0000001): self.u = e0 - self.rho * e1 self.predy = self.y - self.u - xb = spdot(x, b) + xb = spdot(self.x, b) self.predy_e = inverse_prod( w.sparse, xb, self.rho, inv_method="power_exp", threshold=epsilon @@ -289,7 +297,7 @@ def __init__(self, y, x, w, method="full", epsilon=0.0000001): wpredy = weights.lag_spatial(w, self.predy_e) wpyTwpy = spdot(wpredy.T, wpredy) - xTwpy = spdot(x.T, wpredy) + xTwpy = spdot(self.x.T, wpredy) # order of variables is beta, rho, sigma2 @@ -321,6 +329,9 @@ class ML_Lag(BaseML_Lag): independent (exogenous) variable, excluding the constant w : pysal W object Spatial weights object + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the Spatial Durbin type. method : string if 'full', brute force calculation (full matrix expressions) if 'ord', Ord eigenvalue method @@ -337,9 +348,13 @@ class ML_Lag(BaseML_Lag): Name of weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe betas : array (k+1)x1 array of estimated coefficients (rho first) rho : float @@ -567,6 +582,7 @@ def __init__( y, x, w, + slx_lags=0, method="full", epsilon=0.0000001, vm=False, @@ -574,6 +590,7 @@ def __init__( name_x=None, name_w=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) @@ -582,11 +599,16 @@ def __init__( set_warn(self, warn) method = method.upper() BaseML_Lag.__init__( - self, y=y, x=x_constant, w=w, method=method, epsilon=epsilon + self, y=y, x=x_constant, w=w, slx_lags=slx_lags, method=method, epsilon=epsilon ) # increase by 1 to have correct aic and sc, include rho in count self.k += 1 - self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG" + " (METHOD = " + method + ")" + + if slx_lags>0: + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG WITH SLX - SPATIAL DURBIN MODEL" + " (METHOD = " + method + ")" + else: + self.title = "MAXIMUM LIKELIHOOD SPATIAL LAG" + " (METHOD = " + method + ")" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) self.name_x = USER.set_name_x(name_x, x) @@ -595,8 +617,12 @@ def __init__( self.name_w = USER.set_name_w(name_w, w) self.aic = DIAG.akaike(reg=self) self.schwarz = DIAG.schwarz(reg=self) - SUMMARY.ML_Lag(reg=self, w=w, vm=vm, spat_diag=False) - + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x)-1) + ['rho'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _spat_pseudo_r2(self) + self.other_top += _nonspat_top(self, ml=True) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def lag_c_loglik(rho, n, e0, e1, W): # concentrated log-lik for lag model, no constants, brute force @@ -647,3 +673,29 @@ def _test(): np.set_printoptions(suppress=True) doctest.testmod() np.set_printoptions(suppress=start_suppress) + +if __name__ == "__main__": + _test() + + import numpy as np + import libpysal + + db = libpysal.io.open(libpysal.examples.get_path("columbus.dbf"), "r") + y_var = "CRIME" + y = np.array([db.by_col(y_var)]).reshape(49, 1) + x_var = ["INC"] + x = np.array([db.by_col(name) for name in x_var]).T + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w.transform = "r" + model = ML_Lag( + y, + x, + w=w, + vm=False, + name_y=y_var, + name_x=x_var, + name_ds="columbus", + name_w="columbus.gal", + ) + print(model.output) + print(model.summary) diff --git a/spreg/ml_lag_regimes.py b/spreg/ml_lag_regimes.py index 4e603d09..a05c00b2 100644 --- a/spreg/ml_lag_regimes.py +++ b/spreg/ml_lag_regimes.py @@ -7,12 +7,14 @@ import numpy as np from . import regimes as REGI from . import user_output as USER -from . import summary_output as SUMMARY from . import diagnostics as DIAG import multiprocessing as mp from .ml_lag import BaseML_Lag -from .utils import set_warn +from .utils import set_warn, get_lags from platform import system +import pandas as pd +from .output import output, _nonspat_top, _spat_pseudo_r2 + __all__ = ["ML_Lag_Regimes"] @@ -55,6 +57,9 @@ class ML_Lag_Regimes(BaseML_Lag, REGI.Regimes_Frame): if 'LU', LU sparse matrix decomposition epsilon : float tolerance criterion in mimimize_scalar function and inverse_product + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the Spatial Durbin type. regime_lag_sep: boolean If True, the spatial parameter for spatial lag is also computed according to different regimes. If False (default), @@ -76,9 +81,13 @@ class ML_Lag_Regimes(BaseML_Lag, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regimes variable for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -297,8 +306,8 @@ def __init__( cols2regi="all", method="full", epsilon=0.0000001, + slx_lags=0, regime_lag_sep=False, - regime_err_sep=False, cores=False, vm=False, name_y=None, @@ -306,6 +315,7 @@ def __init__( name_w=None, name_ds=None, name_regimes=None, + latex=False, ): n = USER.check_arrays(y, x) @@ -318,6 +328,11 @@ def __init__( set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + if slx_lags >0: + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + self.name_x_r = USER.set_name_x(name_x, x_constant) + [USER.set_name_yend_sp(name_y)] self.method = method self.epsilon = epsilon @@ -364,6 +379,7 @@ def __init__( w_i, w, regi_ids, + slx_lags=slx_lags, cores=cores, cols2regi=cols2regi, method=method, @@ -374,17 +390,19 @@ def __init__( name_regimes=self.name_regimes, name_w=name_w, name_ds=name_ds, + latex=latex, ) else: # if regime_lag_sep == True: # w = REGI.w_regimes_union(w, w_i, self.regimes_set) - x, self.name_x = REGI.Regimes_Frame.__init__( + x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi, cols2regi=cols2regi[:-1], names=name_x, + rlist=True ) self.name_x.append("_Global_" + USER.set_name_yend_sp(name_y)) BaseML_Lag.__init__(self, y=y, x=x, w=w, method=method, epsilon=epsilon) @@ -395,13 +413,17 @@ def __init__( self.aic = DIAG.akaike(reg=self) self.schwarz = DIAG.schwarz(reg=self) self.regime_lag_sep = regime_lag_sep - self.title = ( - "MAXIMUM LIKELIHOOD SPATIAL LAG - REGIMES" - + " (METHOD = " - + method - + ")" - ) - SUMMARY.ML_Lag(reg=self, w=w, vm=vm, spat_diag=False, regimes=True) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * (len(self.name_x) - 1) + ['rho'] + self.output['regime'] = x_rlist+['_Global'] + self.output['equation'] = 0 + self.other_top = _spat_pseudo_r2(self) + self.other_top += _nonspat_top(self, ml=True) + if slx_lags == 0: + self.title = ("MAXIMUM LIKELIHOOD SPATIAL LAG - REGIMES"+ " (METHOD = "+ method+ ")") + else: + self.title = ("MAXIMUM LIKELIHOOD SPATIAL DURBIN - REGIMES"+ " (METHOD = "+ method+ ")") + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def ML_Lag_Regimes_Multi( self, @@ -410,6 +432,7 @@ def ML_Lag_Regimes_Multi( w_i, w, regi_ids, + slx_lags, cores, cols2regi, method, @@ -420,8 +443,9 @@ def ML_Lag_Regimes_Multi( name_regimes, name_w, name_ds, + latex, ): - # pool = mp.Pool(cores) + #pool = mp.Pool(cores) results_p = {} """ for r in self.regimes_set: @@ -445,6 +469,7 @@ def ML_Lag_Regimes_Multi( regi_ids, r, w_i[r], + slx_lags, method, epsilon, name_ds, @@ -462,6 +487,7 @@ def ML_Lag_Regimes_Multi( regi_ids, r, w_i[r], + slx_lags, method, epsilon, name_ds, @@ -496,6 +522,7 @@ def ML_Lag_Regimes_Multi( results = {} self.name_y, self.name_x = [], [] counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -528,12 +555,16 @@ def ML_Lag_Regimes_Multi( ] = results[r].e_pred self.name_y += results[r].name_y self.name_x += results[r].name_x + results[r].other_top = _spat_pseudo_r2(results[r]) + results[r].other_top += _nonspat_top(results[r], ml=True) + results[r].other_mid = "" + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x'] * (len(results[r].name_x) - 1) + ['rho'], + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.multi = results self.chow = REGI.Chow(self) - SUMMARY.ML_Lag_multi( - reg=self, multireg=self.multi, vm=vm, spat_diag=False, regimes=True, w=w - ) + output(reg=self, vm=vm, robust=False, other_end=False, latex=latex) def _work( @@ -542,6 +573,7 @@ def _work( regi_ids, r, w_r, + slx_lags, method, epsilon, name_ds, @@ -553,13 +585,10 @@ def _work( y_r = y[regi_ids[r]] x_r = x[regi_ids[r]] model = BaseML_Lag(y_r, x_r, w_r, method=method, epsilon=epsilon) - model.title = ( - "MAXIMUM LIKELIHOOD SPATIAL LAG - REGIME " - + str(r) - + " (METHOD = " - + method - + ")" - ) + if slx_lags == 0: + model.title = ("MAXIMUM LIKELIHOOD SPATIAL LAG - REGIME "+ str(r)+ " (METHOD = "+ method+ ")") + else: + model.title = ("MAXIMUM LIKELIHOOD SPATIAL DURBIN - REGIME "+ str(r)+ " (METHOD = "+ method+ ")") model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) model.name_x = ["%s_%s" % (str(r), i) for i in name_x] @@ -583,16 +612,16 @@ def _test(): if __name__ == "__main__": _test() import numpy as np - import libpysal + import libpysal as ps - db = libpysal.io.open(libpysal.examples.get_path("baltim.dbf"), "r") + db = ps.io.open(ps.examples.get_path("baltim.dbf"), "r") ds_name = "baltim.dbf" y_name = "PRICE" y = np.array(db.by_col(y_name)).T y.shape = (len(y), 1) x_names = ["NROOM", "NBATH", "PATIO", "FIREPL", "AC", "GAR", "AGE", "LOTSZ", "SQFT"] x = np.array([db.by_col(var) for var in x_names]).T - ww = ps.open(ps.examples.get_path("baltim_q.gal")) + ww = ps.io.open(ps.examples.get_path("baltim_q.gal")) w = ww.read() ww.close() w_name = "baltim_q.gal" @@ -610,7 +639,9 @@ def _test(): name_w=w_name, name_ds=ds_name, regime_lag_sep=True, + regime_err_sep=False, constant_regi="many", name_regimes="CITCOU", ) + print(mllag.output) print(mllag.summary) diff --git a/spreg/ols.py b/spreg/ols.py index eb739230..f22bac19 100644 --- a/spreg/ols.py +++ b/spreg/ols.py @@ -1,13 +1,13 @@ """Ordinary Least Squares regression classes.""" -__author__ = "Luc Anselin luc.anselin@asu.edu, David C. Folch david.folch@asu.edu" +__author__ = "Luc Anselin lanselin@gmail.com, Pedro Amaral pedrovma@gmail.com, David C. Folch david.folch@asu.edu" import numpy as np -import copy as COPY import numpy.linalg as la from . import user_output as USER -from . import summary_output as SUMMARY +from .output import output, _spat_diag_out, _nonspat_mid, _nonspat_top from . import robust as ROBUST -from .utils import spdot, sphstack, RegressionPropsY, RegressionPropsVM, set_warn +from .utils import spdot, RegressionPropsY, RegressionPropsVM, set_warn, get_lags +import pandas as pd __all__ = ["OLS"] @@ -144,6 +144,9 @@ class OLS(BaseOLS): gwk : pysal W object Kernel spatial weights needed for HAC estimation. Note: matrix must have ones along the main diagonal. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SLX type. sig2n_k : boolean If True, then use n-k to estimate sigma^2. If False, use n. nonspat_diag : boolean @@ -171,10 +174,13 @@ class OLS(BaseOLS): Name of kernel weights matrix for use in output name_ds : string Name of dataset for use in output - + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -349,8 +355,9 @@ class OLS(BaseOLS): ready to be printed: >>> print(ols.summary) - REGRESSION - ---------- + REGRESSION RESULTS + ------------------ + SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES ----------------------------------------- Data set : columbus @@ -360,7 +367,7 @@ class OLS(BaseOLS): S.D. dependent var : 18.4661 Degrees of Freedom : 46 R-squared : 0.3495 Adjusted R-squared : 0.3212 - Sum squared residual: 10647.015 F-statistic : 12.3582 + Sum squared residual: 10647 F-statistic : 12.3582 Sigma-square : 231.457 Prob(F-statistic) : 5.064e-05 S.E. of regression : 15.214 Log likelihood : -201.368 Sigma-square ML : 217.286 Akaike info criterion : 408.735 @@ -426,6 +433,7 @@ def __init__( w=None, robust=None, gwk=None, + slx_lags = 0, sig2n_k=True, nonspat_diag=True, spat_diag=False, @@ -437,34 +445,57 @@ def __init__( name_w=None, name_gwk=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) - USER.check_weights(w, y) USER.check_robust(robust, gwk) + if robust == "hac" and spat_diag: + set_warn( + self, + "Spatial diagnostics are not available for HAC estimation. Hence, spatial diagnostics have been disabled for this model.", + ) + spat_diag = False + if robust in ["hac", "white"] and white_test: + set_warn( + self, + "White test not available when standard errors are estimated by HAC or White correction.", + ) + white_test = False USER.check_spat_diag(spat_diag, w) x_constant, name_x, warn = USER.check_constant(x, name_x) + self.name_x = USER.set_name_x(name_x, x_constant) + if slx_lags >0: + USER.check_weights(w, y, w_required=True) + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + self.name_x += USER.set_name_spatial_lags(self.name_x[1:], slx_lags) + else: + USER.check_weights(w, y, w_required=False) set_warn(self, warn) BaseOLS.__init__( self, y=y, x=x_constant, robust=robust, gwk=gwk, sig2n_k=sig2n_k ) - self.title = "ORDINARY LEAST SQUARES" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) - self.name_x = USER.set_name_x(name_x, x_constant) + self.title = "ORDINARY LEAST SQUARES" + if slx_lags > 0: + self.title += " WITH SPATIALLY LAGGED X (SLX)" self.robust = USER.set_robust(robust) self.name_w = USER.set_name_w(name_w, w) self.name_gwk = USER.set_name_w(name_gwk, gwk) - SUMMARY.OLS( - reg=self, - vm=vm, - w=w, - nonspat_diag=nonspat_diag, - spat_diag=spat_diag, - moran=moran, - white_test=white_test, - ) + self.output = pd.DataFrame(self.name_x, columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top, self.other_mid, other_end = ("", "", "") # strings where function-specific diag. are stored + if nonspat_diag: + self.other_mid += _nonspat_mid(self, white_test=white_test) + self.other_top += _nonspat_top(self) + if spat_diag: + other_end += _spat_diag_out(self, w, 'ols', moran=moran) + output(reg=self, vm=vm, robust=robust, other_end=other_end, latex=latex) + def _test(): @@ -505,4 +536,5 @@ def _test(): sig2n_k=True, moran=True, ) + print(ols.output) print(ols.summary) diff --git a/spreg/ols_regimes.py b/spreg/ols_regimes.py index bf06f1f2..20b2acea 100755 --- a/spreg/ols_regimes.py +++ b/spreg/ols_regimes.py @@ -2,23 +2,20 @@ Ordinary Least Squares regression with regimes. """ -__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, Daniel Arribas-Bel darribas@asu.edu" +__author__ = "Luc Anselin, Pedro V. Amaral, Daniel Arribas-Bel" +import numpy as np +import multiprocessing as mp +import pandas as pd from . import regimes as REGI from . import user_output as USER +from .utils import set_warn, RegressionProps_basic, spdot, RegressionPropsY, get_lags from .ols import BaseOLS -from .utils import set_warn, spbroadcast, RegressionProps_basic, RegressionPropsY, spdot from .robust import hac_multi -from . import summary_output as SUMMARY -import numpy as np -import multiprocessing as mp -from platform import system -import scipy.sparse as SP -import copy as COPY +from .output import output, _spat_diag_out, _nonspat_mid, _nonspat_top class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): - """ Ordinary least squares with results and diagnostics. @@ -43,6 +40,10 @@ class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): gwk : pysal W object Kernel spatial weights needed for HAC estimation. Note: matrix must have ones along the main diagonal. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SLX type. + Note: WX is computed using the complete weights matrix sig2n_k : boolean If True, then use n-k to estimate sigma^2. If False, use n. nonspat_diag : boolean @@ -92,10 +93,13 @@ class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): Name of dataset for use in output name_regimes : string Name of regime variable for use in the output - + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -307,7 +311,7 @@ class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): >>> y_var = 'HR90' >>> y = db.by_col(y_var) - >>> y = np.array(y).reshape(len(y), 1) + >>> y = np.array(y) Extract UE90 (unemployment rate) and PS90 (population structure) vectors from the DBF to be used as independent variables in the regression. Other variables @@ -327,58 +331,122 @@ class OLS_Regimes(BaseOLS, REGI.Regimes_Frame, RegressionPropsY): We can now run the regression and then have a summary of the output by typing: olsr.summary - Alternatively, we can just check the betas and standard errors of the - parameters: >>> olsr = OLS_Regimes(y, x, regimes, nonspat_diag=False, name_y=y_var, name_x=['PS90','UE90'], name_regimes=r_var, name_ds='NAT') - >>> olsr.betas - array([[0.39642899], - [0.65583299], - [0.48703937], - [5.59835 ], - [1.16210453], - [0.53163886]]) - >>> np.sqrt(olsr.vm.diagonal()) - array([0.24816345, 0.09662678, 0.03628629, 0.46894564, 0.21667395, - 0.05945651]) - >>> olsr.cols2regi - 'all' + >>> print(olsr.summary) + REGRESSION RESULTS + ------------------ + + SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES ESTIMATION - REGIME 0 + --------------------------------------------------------------- + Data set : NAT + Weights matrix : None + Dependent Variable : 0_HR90 Number of Observations: 1673 + Mean dependent var : 3.3416 Number of Variables : 3 + S.D. dependent var : 4.6795 Degrees of Freedom : 1670 + R-squared : 0.1271 + Adjusted R-squared : 0.1260 + + ------------------------------------------------------------------------------------ + Variable Coefficient Std.Error t-Statistic Probability + ------------------------------------------------------------------------------------ + 0_CONSTANT 0.3964290 0.2481634 1.5974512 0.1103544 + 0_PS90 0.6558330 0.0966268 6.7872800 0.0000000 + 0_UE90 0.4870394 0.0362863 13.4221336 0.0000000 + ------------------------------------------------------------------------------------ + Regimes variable: SOUTH + + SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES ESTIMATION - REGIME 1 + --------------------------------------------------------------- + Data set : NAT + Weights matrix : None + Dependent Variable : 1_HR90 Number of Observations: 1412 + Mean dependent var : 9.5493 Number of Variables : 3 + S.D. dependent var : 7.0389 Degrees of Freedom : 1409 + R-squared : 0.0661 + Adjusted R-squared : 0.0647 + + ------------------------------------------------------------------------------------ + Variable Coefficient Std.Error t-Statistic Probability + ------------------------------------------------------------------------------------ + 1_CONSTANT 5.5983500 0.4689456 11.9381640 0.0000000 + 1_PS90 1.1621045 0.2166740 5.3633790 0.0000001 + 1_UE90 0.5316389 0.0594565 8.9416422 0.0000000 + ------------------------------------------------------------------------------------ + Regimes variable: SOUTH + ------------------------------------------------------------------------------------ + GLOBAL DIAGNOSTICS + + REGIMES DIAGNOSTICS - CHOW TEST + VARIABLE DF VALUE PROB + CONSTANT 1 96.129 0.0000 + PS90 1 4.554 0.0328 + UE90 1 0.410 0.5220 + Global test 3 680.960 0.0000 + ================================ END OF REPORT ===================================== """ def __init__( - self, - y, - x, - regimes, - w=None, - robust=None, - gwk=None, - sig2n_k=True, - nonspat_diag=True, - spat_diag=False, - moran=False, - white_test=False, - vm=False, - constant_regi="many", - cols2regi="all", - regime_err_sep=True, - cores=False, - name_y=None, - name_x=None, - name_regimes=None, - name_w=None, - name_gwk=None, - name_ds=None, + self, + y, + x, + regimes, + w=None, + robust=None, + gwk=None, + slx_lags=0, + sig2n_k=True, + nonspat_diag=True, + spat_diag=False, + moran=False, + white_test=False, + vm=False, + constant_regi="many", + cols2regi="all", + regime_err_sep=True, + cores=False, + name_y=None, + name_x=None, + name_regimes=None, + name_w=None, + name_gwk=None, + name_ds=None, + latex=False ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) - USER.check_weights(w, y) USER.check_robust(robust, gwk) + if robust == "hac": + if regime_err_sep: + set_warn( + self, + "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.", + ) + regime_err_sep = False + if spat_diag: + set_warn( + self, + "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", + ) + spat_diag = False + if robust in ["hac", "white"] and white_test: + set_warn( + self, + "White test not available when standard errors are estimated by HAC or White correction.", + ) + white_test = False USER.check_spat_diag(spat_diag, w) x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) - set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + if slx_lags > 0: + USER.check_weights(w, y, w_required=True) + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + else: + USER.check_weights(w, y, w_required=False) + set_warn(self, warn) self.name_x_r = USER.set_name_x(name_x, x_constant) self.constant_regi = constant_regi self.cols2regi = cols2regi @@ -394,17 +462,11 @@ def __init__( self.regimes_set = REGI._get_regimes_set(regimes) self.regimes = regimes USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1]) - if regime_err_sep == True and robust == "hac": - set_warn( - self, - "Error by regimes is incompatible with HAC estimation. Hence, error by regimes has been disabled for this model.", - ) - regime_err_sep = False self.regime_err_sep = regime_err_sep if ( - regime_err_sep == True - and set(cols2regi) == set([True]) - and constant_regi == "many" + regime_err_sep == True + and set(cols2regi) == set([True]) + and constant_regi == "many" ): self.y = y regi_ids = dict( @@ -416,6 +478,7 @@ def __init__( regi_ids, cores, gwk, + slx_lags, sig2n_k, robust, nonspat_diag, @@ -424,11 +487,19 @@ def __init__( name_x, moran, white_test, + latex ) else: - x, self.name_x = REGI.Regimes_Frame.__init__( - self, x_constant, regimes, constant_regi, cols2regi, name_x + x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( + self, x_constant, regimes, constant_regi, cols2regi, name_x, rlist=True ) + + self.output = pd.DataFrame(self.name_x, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + self.output['regime'] = x_rlist + self.output['equation'] = 0 + BaseOLS.__init__(self, y=y, x=x, robust=robust, gwk=gwk, sig2n_k=sig2n_k) if regime_err_sep == True and robust == None: y2, x2 = REGI._get_weighted_var( @@ -439,41 +510,45 @@ def __init__( self.title = ( "ORDINARY LEAST SQUARES - REGIMES (Group-wise heteroskedasticity)" ) + if slx_lags > 0: + self.title = "ORDINARY LEAST SQUARES WITH SLX - REGIMES (Group-wise heteroskedasticity)" nonspat_diag = None set_warn( self, "Residuals treated as homoskedastic for the purpose of diagnostics.", ) else: - self.title = "ORDINARY LEAST SQUARES - REGIMES" + if slx_lags == 0: + self.title = "ORDINARY LEAST SQUARES - REGIMES" + else: + self.title = "ORDINARY LEAST SQUARES WITH SLX - REGIMES" self.robust = USER.set_robust(robust) self.chow = REGI.Chow(self) - SUMMARY.OLS( - reg=self, - vm=vm, - w=w, - nonspat_diag=nonspat_diag, - spat_diag=spat_diag, - moran=moran, - white_test=white_test, - regimes=True, - ) + self.other_top, self.other_mid, other_end = ("", "", "") # strings where function-specific diag. are stored + if nonspat_diag: + self.other_mid += _nonspat_mid(self, white_test=white_test) + self.other_top += _nonspat_top(self) + if spat_diag: + other_end += _spat_diag_out(self, w, 'ols', moran=moran) + output(reg=self, vm=vm, robust=robust, other_end=other_end, latex=latex) def _ols_regimes_multi( - self, - x, - w, - regi_ids, - cores, - gwk, - sig2n_k, - robust, - nonspat_diag, - spat_diag, - vm, - name_x, - moran, - white_test, + self, + x, + w, + regi_ids, + cores, + gwk, + slx_lags, + sig2n_k, + robust, + nonspat_diag, + spat_diag, + vm, + name_x, + moran, + white_test, + latex ): results_p = {} """ @@ -506,6 +581,7 @@ def _ols_regimes_multi( name_x, self.name_w, self.name_regimes, + slx_lags ), ) else: @@ -523,6 +599,7 @@ def _ols_regimes_multi( name_x, self.name_w, self.name_regimes, + slx_lags ) ) self.kryd = 0 @@ -545,6 +622,7 @@ def _ols_regimes_multi( results = {} self.name_y, self.name_x = [], [] counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -558,11 +636,11 @@ def _ols_regimes_multi( results[r] = results_p[r].get() self.vm[ - (counter * self.kr) : ((counter + 1) * self.kr), - (counter * self.kr) : ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), ] = results[r].vm self.betas[ - (counter * self.kr) : ((counter + 1) * self.kr), + (counter * self.kr): ((counter + 1) * self.kr), ] = results[r].betas self.u[ regi_ids[r], @@ -572,25 +650,24 @@ def _ols_regimes_multi( ] = results[r].predy self.name_y += results[r].name_y self.name_x += results[r].name_x + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x, + 'var_type': ['x']*len(results[r].name_x), + 'regime': r, 'equation': r})], ignore_index=True) + results[r].other_top, results[r].other_mid = ("", "") + if nonspat_diag: + results[r].other_mid += _nonspat_mid(results[r], white_test=white_test) + results[r].other_top += _nonspat_top(results[r]) counter += 1 self.multi = results self.hac_var = x_constant[:, 1:] if robust == "hac": hac_multi(self, gwk) self.chow = REGI.Chow(self) + other_end = "" if spat_diag: self._get_spat_diag_props(x_constant, sig2n_k) - SUMMARY.OLS_multi( - reg=self, - multireg=self.multi, - vm=vm, - nonspat_diag=nonspat_diag, - spat_diag=spat_diag, - moran=moran, - white_test=white_test, - regimes=True, - w=w, - ) + other_end += _spat_diag_out(self, w, 'ols', moran=moran) + output(reg=self, vm=vm, robust=robust, other_end=other_end, latex=latex) def _get_spat_diag_props(self, x, sig2n_k): self.k = self.kr @@ -603,7 +680,7 @@ def _get_spat_diag_props(self, x, sig2n_k): def _work( - y, x, w, regi_ids, r, robust, sig2n_k, name_ds, name_y, name_x, name_w, name_regimes + y, x, w, regi_ids, r, robust, sig2n_k, name_ds, name_y, name_x, name_w, name_regimes, slx_lags ): y_r = y[regi_ids[r]] x_r = x[regi_ids[r]] @@ -612,7 +689,10 @@ def _work( if robust == "hac": robust = None model = BaseOLS(y_r, x_r, robust=robust, sig2n_k=sig2n_k) - model.title = "ORDINARY LEAST SQUARES ESTIMATION - REGIME %s" % r + if slx_lags == 0: + model.title = "ORDINARY LEAST SQUARES ESTIMATION - REGIME %s" % r + else: + model.title = "ORDINARY LEAST SQUARES ESTIMATION WITH SLX - REGIME %s" % r model.robust = USER.set_robust(robust) model.name_ds = name_ds model.name_y = "%s_%s" % (str(r), name_y) @@ -639,32 +719,35 @@ def _test(): _test() import numpy as np import libpysal + import pysal db = libpysal.io.open(libpysal.examples.get_path("NAT.dbf"), "r") - y_var = "CRIME" - y = np.array([db.by_col(y_var)]).reshape(49, 1) - x_var = ["INC", "HOVAL"] + y_var = "HR90" + y = np.array(db.by_col(y_var)).reshape(-1,1) + x_var = ['PS90','UE90'] x = np.array([db.by_col(name) for name in x_var]).T - r_var = "NSA" + r_var = "SOUTH" regimes = db.by_col(r_var) - w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("columbus.shp")) + w = libpysal.weights.Rook.from_shapefile(libpysal.examples.get_path("NAT.shp")) w.transform = "r" + #olsr = pysal.model.spreg.OLS_Regimes( olsr = OLS_Regimes( - y, + y, x, regimes, w=w, constant_regi="many", - nonspat_diag=False, - spat_diag=False, + nonspat_diag=True, + spat_diag=True, name_y=y_var, - name_x=["INC", "HOVAL"], - name_ds="columbus", + name_x=x_var, + name_ds="NAT", name_regimes=r_var, - name_w="columbus.gal", regime_err_sep=True, cols2regi=[True, True], - sig2n_k=True, - robust="white", + sig2n_k=False, + white_test=True, + #robust="white" ) + print(olsr.output) print(olsr.summary) diff --git a/spreg/output.py b/spreg/output.py new file mode 100644 index 00000000..c871c842 --- /dev/null +++ b/spreg/output.py @@ -0,0 +1,476 @@ +"""Internal helper files for user output.""" + +__author__ = "Luc Anselin, Pedro V. Amaral" + +import textwrap as TW +import numpy as np +import pandas as pd +from . import diagnostics as diagnostics +from . import diagnostics_tsls as diagnostics_tsls +from . import diagnostics_sp as diagnostics_sp + +__all__ = [] + +############################################################################### +############### Primary functions for running summary diagnostics ############# +############################################################################### + +def output(reg, vm, other_end=False, robust=False, latex=False): + strSummary = output_start(reg) + for eq in reg.output['equation'].unique(): + try: + reg.multi[eq].__summary = {} + strSummary, reg.multi[eq] = out_part_top(strSummary, reg, eq) + except: + eq = None + strSummary, reg = out_part_top(strSummary, reg, eq) + strSummary, reg = out_part_middle(strSummary, reg, robust, m=eq) + strSummary, reg = out_part_end(strSummary, reg, vm, other_end, m=eq) + reg.summary = strSummary + reg.output.sort_values(by=['equation', 'regime'], inplace=True) + reg.output.drop(['var_type', 'regime', 'equation'], axis=1, inplace=True) + +def output_start(reg): + reg.__summary = {} + strSummary = "REGRESSION RESULTS\n" + strSummary += "------------------\n" + reg.output = reg.output.assign(coefficients=[None] * len(reg.output), std_err=[None] * len(reg.output), + zt_stat=[None] * len(reg.output), prob=[None] * len(reg.output)) + return strSummary + +def out_part_top(strSummary, reg, m): + # Top part of summary output. + # m = None for single models, m = 1,2,3... for multiple equation models + if m == None: + _reg = reg # _reg = local object with regression results + else: + _reg = reg.multi[m] # _reg = local object with equation specific regression results + title = "\nSUMMARY OF OUTPUT: " + _reg.title + "\n" + strSummary += title + strSummary += "-" * (len(title) - 2) + "\n" + strSummary += "%-20s:%12s\n" % ("Data set", _reg.name_ds) + try: + strSummary += "%-20s:%12s\n" % ("Weights matrix", _reg.name_w) + except: + pass + + strSummary += "%-20s:%12s %-22s:%12d\n" % ( + "Dependent Variable", + _reg.name_y, + "Number of Observations", + _reg.n, + ) + + strSummary += "%-20s:%12.4f %-22s:%12d\n" % ( + "Mean dependent var", + _reg.mean_y, + "Number of Variables", + _reg.k, + ) + strSummary += "%-20s:%12.4f %-22s:%12d\n" % ( + "S.D. dependent var", + _reg.std_y, + "Degrees of Freedom", + _reg.n - _reg.k, + ) + + _reg.std_err = diagnostics.se_betas(_reg) + if 'OLS' in reg.__class__.__name__: + _reg.t_stat = diagnostics.t_stat(_reg) + _reg.r2 = diagnostics.r2(_reg) + _reg.ar2 = diagnostics.ar2(_reg) + strSummary += "%-20s:%12.4f\n%-20s:%12.4f\n" % ( + "R-squared", + _reg.r2, + "Adjusted R-squared", + _reg.ar2, + ) + _reg.__summary["summary_zt"] = "t" + + else: + _reg.z_stat = diagnostics.t_stat(_reg, z_stat=True) + _reg.pr2 = diagnostics_tsls.pr2_aspatial(_reg) + strSummary += "%-20s:%12.4f\n" % ("Pseudo R-squared", _reg.pr2) + _reg.__summary["summary_zt"] = "z" + + try: # Adding additional top part if there is one + strSummary += _reg.other_top + except: + pass + + return (strSummary, _reg) + +def out_part_middle(strSummary, reg, robust, m=None): + # Middle part of summary output. + # m = None for single models, m = 1,2,3... for multiple equation models + if m==None: + _reg = reg #_reg = local object with regression results + m = reg.output['equation'].unique()[0] + else: + _reg = reg.multi[m] #_reg = local object with equation specific regression results + coefs = pd.DataFrame(_reg.betas, columns=['coefficients']) + coefs['std_err'] = pd.DataFrame(_reg.std_err) + try: + coefs = pd.concat([coefs, pd.DataFrame(_reg.z_stat, columns=['zt_stat', 'prob'])], axis=1) + except AttributeError: + coefs = pd.concat([coefs, pd.DataFrame(_reg.t_stat, columns=['zt_stat', 'prob'])], axis=1) + coefs.index = reg.output[reg.output['equation'] == m].index + reg.output.update(coefs) + strSummary += "\n" + if robust: + if robust == "white": + strSummary += "White Standard Errors\n" + elif robust == "hac": + strSummary += "HAC Standard Errors; Kernel Weights: " + _reg.name_gwk + "\n" + elif robust == "ogmm": + strSummary += "Optimal GMM used to estimate the coefficients and the variance-covariance matrix\n" + strSummary += "------------------------------------------------------------------------------------\n" + strSummary += ( + " Variable Coefficient Std.Error %1s-Statistic Probability\n" + % (_reg.__summary["summary_zt"]) + ) + strSummary += "------------------------------------------------------------------------------------\n" + m_output = reg.output[reg.output['equation'] == m] + for row in m_output.iloc[np.lexsort((m_output.index, m_output['regime']))].itertuples(): + try: + strSummary += "%20s %12.5f %12.5f %12.5f %12.5f\n" % ( + row.var_names, + row.coefficients, + row.std_err, + row.zt_stat, + row.prob + ) + except TypeError: # special case for models that do not have inference on the lambda term + strSummary += "%20s %12.5f \n" % ( + row.var_names, + row.coefficients + ) + strSummary += "------------------------------------------------------------------------------------\n" + + try: # Adding info on instruments if they are present + name_q = _reg.name_q + name_yend = _reg.name_yend + insts = "Instruments: " + for name in sorted(name_q): + insts += name + ", " + text_wrapper = TW.TextWrapper(width=76, subsequent_indent=" ") + insts = text_wrapper.fill(insts[:-2]) + insts += "\n" + inst2 = "Instrumented: " + for name in sorted(name_yend): + inst2 += name + ", " + text_wrapper = TW.TextWrapper(width=76, subsequent_indent=" ") + inst2 = text_wrapper.fill(inst2[:-2]) + inst2 += "\n" + inst2 += insts + strSummary += inst2 + except: + pass + + try: # Adding info on regimes if they are present + strSummary += ("Regimes variable: %s\n" % _reg.name_regimes) + strSummary += _summary_chow(_reg) # If local regimes present, compute Chow test. + except: + pass + + try: # Adding local warning if there is one + if _reg.warning: + strSummary += _reg.warning + except: + pass + + try: # Adding other middle part if there are any + strSummary += _reg.other_mid + except: + pass + + return (strSummary, reg) + +def out_part_end(strSummary, reg, vm, other_end, m=None): + if m is not None: + strSummary += "------------------------------------------------------------------------------------\n" + try: # Adding global warning if there is one + strSummary += reg.warning + except: + pass + strSummary += "\nGLOBAL DIAGNOSTICS\n" + try: # Adding global Chow test if there is one + strSummary += _summary_chow(reg) + except: + pass + if other_end: + strSummary += other_end + if vm: + strSummary += _summary_vm(reg) + strSummary += "================================ END OF REPORT =====================================" + return (strSummary, reg) + +def _summary_chow(reg): + sum_text = "\nREGIMES DIAGNOSTICS - CHOW TEST" + name_x_r = reg.name_x_r + joint, regi = reg.chow.joint, reg.chow.regi + sum_text += "\n VARIABLE DF VALUE PROB\n" + if reg.cols2regi == "all": + names_chow = name_x_r[1:] + else: + names_chow = [name_x_r[1:][i] for i in np.where(reg.cols2regi)[0]] + if reg.constant_regi == "many": + names_chow = ["CONSTANT"] + names_chow + + if 'lambda' in reg.output.var_type.values: + if reg.output.var_type.value_counts()['lambda'] > 1: + names_chow += ["lambda"] + + reg.output_chow = pd.DataFrame() + reg.output_chow['var_names'] = names_chow + reg.output_chow['df'] = reg.nr - 1 + reg.output_chow = pd.concat([reg.output_chow, pd.DataFrame(regi, columns=['value', 'prob'])], axis=1) + reg.output_chow = pd.concat([reg.output_chow, pd.DataFrame([{'var_names': 'Global test', + 'df': reg.kr * (reg.nr - 1), + 'value': joint[0], 'prob': joint[1]}])], ignore_index=True) + for row in reg.output_chow.itertuples(): + sum_text += "%25s %2d %12.3f %9.4f\n" % ( + row.var_names, + row.df, + row.value, + row.prob + ) + + return sum_text + + +def _spat_diag_out(reg, w, type, moran=False): + strSummary = "\nDIAGNOSTICS FOR SPATIAL DEPENDENCE\n" + if not moran: + strSummary += ( + "TEST DF VALUE PROB\n" + ) + else: + strSummary += ( + "TEST MI/DF VALUE PROB\n" + ) + + cache = diagnostics_sp.spDcache(reg, w) + if type == "yend": + mi, ak, ak_p = diagnostics_sp.akTest(reg, w, cache) + reg.ak_test = ak, ak_p + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Anselin-Kelejian Test", + 1, + reg.ak_test[0], + reg.ak_test[1], + ) + elif type == "ols": + lm_tests = diagnostics_sp.LMtests(reg, w) + reg.lm_error = lm_tests.lme + reg.lm_lag = lm_tests.lml + reg.rlm_error = lm_tests.rlme + reg.rlm_lag = lm_tests.rlml + reg.lm_sarma = lm_tests.sarma + if moran: + moran_res = diagnostics_sp.MoranRes(reg, w, z=True) + reg.moran_res = moran_res.I, moran_res.zI, moran_res.p_norm + strSummary += "%-27s %8.4f %9.3f %9.4f\n" % ( + "Moran's I (error)", + reg.moran_res[0], + reg.moran_res[1], + reg.moran_res[2], + ) + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Lagrange Multiplier (lag)", + 1, + reg.lm_lag[0], + reg.lm_lag[1], + ) + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Robust LM (lag)", + 1, + reg.rlm_lag[0], + reg.rlm_lag[1], + ) + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Lagrange Multiplier (error)", + 1, + reg.lm_error[0], + reg.lm_error[1], + ) + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Robust LM (error)", + 1, + reg.rlm_error[0], + reg.rlm_error[1], + ) + strSummary += "%-27s %2d %12.3f %9.4f\n\n" % ( + "Lagrange Multiplier (SARMA)", + 2, + reg.lm_sarma[0], + reg.lm_sarma[1], + ) + return strSummary + +def _nonspat_top(reg, ml=False): + if not ml: + reg.sig2ML = reg.sig2n + reg.f_stat = diagnostics.f_stat(reg) + reg.logll = diagnostics.log_likelihood(reg) + reg.aic = diagnostics.akaike(reg) + reg.schwarz = diagnostics.schwarz(reg) + + strSummary = "%-20s:%12.6g %-22s:%12.4f\n" % ( + "Sum squared residual", reg.utu, "F-statistic", reg.f_stat[0],) + strSummary += "%-20s:%12.3f %-22s:%12.4g\n" % ( + "Sigma-square", + reg.sig2, + "Prob(F-statistic)", + reg.f_stat[1], + ) + strSummary += "%-20s:%12.3f %-22s:%12.3f\n" % ( + "S.E. of regression", + np.sqrt(reg.sig2), + "Log likelihood", + reg.logll, + ) + strSummary += "%-20s:%12.3f %-22s:%12.3f\n" % ( + "Sigma-square ML", + reg.sig2ML, + "Akaike info criterion", + reg.aic, + ) + strSummary += "%-20s:%12.4f %-22s:%12.3f\n" % ( + "S.E of regression ML", + np.sqrt(reg.sig2ML), + "Schwarz criterion", + reg.schwarz, + ) + else: + strSummary = "%-20s:%12.4f\n" % ( + "Log likelihood", + reg.logll, + ) + strSummary += "%-20s:%12.4f %-22s:%12.3f\n" % ( + "Sigma-square ML", + reg.sig2, + "Akaike info criterion", + reg.aic, + ) + strSummary += "%-20s:%12.4f %-22s:%12.3f\n" % ( + "S.E of regression", + np.sqrt(reg.sig2), + "Schwarz criterion", + reg.schwarz, + ) + + return strSummary + +def _nonspat_mid(reg, white_test=False): + # compute diagnostics + reg.mulColli = diagnostics.condition_index(reg) + reg.jarque_bera = diagnostics.jarque_bera(reg) + reg.breusch_pagan = diagnostics.breusch_pagan(reg) + reg.koenker_bassett = diagnostics.koenker_bassett(reg) + + if white_test: + reg.white = diagnostics.white(reg) + + strSummary = "\nREGRESSION DIAGNOSTICS\n" + if reg.mulColli: + strSummary += "MULTICOLLINEARITY CONDITION NUMBER %16.3f\n\n" % (reg.mulColli) + strSummary += "TEST ON NORMALITY OF ERRORS\n" + strSummary += "TEST DF VALUE PROB\n" + strSummary += "%-27s %2d %14.3f %9.4f\n\n" % ( + "Jarque-Bera", + reg.jarque_bera["df"], + reg.jarque_bera["jb"], + reg.jarque_bera["pvalue"], + ) + strSummary += "DIAGNOSTICS FOR HETEROSKEDASTICITY\n" + strSummary += "RANDOM COEFFICIENTS\n" + strSummary += "TEST DF VALUE PROB\n" + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Breusch-Pagan test", + reg.breusch_pagan["df"], + reg.breusch_pagan["bp"], + reg.breusch_pagan["pvalue"], + ) + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "Koenker-Bassett test", + reg.koenker_bassett["df"], + reg.koenker_bassett["kb"], + reg.koenker_bassett["pvalue"], + ) + try: + if reg.white: + strSummary += "\nSPECIFICATION ROBUST TEST\n" + if len(reg.white) > 3: + strSummary += reg.white + "\n" + else: + strSummary += ( + "TEST DF VALUE PROB\n" + ) + strSummary += "%-27s %2d %12.3f %9.4f\n" % ( + "White", + reg.white["df"], + reg.white["wh"], + reg.white["pvalue"], + ) + except: + pass + return strSummary + +def _spat_pseudo_r2(reg): + if np.abs(reg.rho) < 1: + reg.pr2_e = diagnostics_tsls.pr2_spatial(reg) + strSummary = "%-20s: %5.4f\n" % ("Spatial Pseudo R-squared", reg.pr2_e) + else: + strSummary = "Spatial Pseudo R-squared: omitted due to rho outside the boundary (-1, 1).\n" + return strSummary + +def _summary_vm(reg): + strVM = "\n" + strVM += "COEFFICIENTS VARIANCE MATRIX\n" + strVM += "----------------------------\n" + try: + for name in reg.name_z: + strVM += "%12s" % (name) + except: + for name in reg.name_x: + strVM += "%12s" % (name) + strVM += "\n" + nrow = reg.vm.shape[0] + ncol = reg.vm.shape[1] + for i in range(nrow): + for j in range(ncol): + strVM += "%12.6f" % (reg.vm[i][j]) + strVM += "\n" + return strVM + +def _summary_iteration(reg): + """Reports the number of iterations computed and the type of estimator used for hom and het models.""" + try: + niter = reg.niter + except: + niter = reg.iteration + + txt = "%-20s:%12s\n" % ("N. of iterations", niter) + + try: + if reg.step1c: + step1c = "Yes" + else: + step1c = "No" + txt = txt[:-1] + " %-22s:%12s\n" % ( + "Step1c computed", + step1c, + ) + except: + pass + + try: + txt = txt[:-1] + " %-22s:%12s" % ( + "A1 type: ", + reg.A1, + ) + except: + pass + + return txt diff --git a/spreg/regimes.py b/spreg/regimes.py index fad91d71..d0c4db06 100755 --- a/spreg/regimes.py +++ b/spreg/regimes.py @@ -3,8 +3,6 @@ import scipy.sparse as SP import itertools as iter from scipy.stats import f, chi2 - -chisqprob = chi2.sf import itertools as iter import numpy.linalg as la from .utils import spbroadcast, set_warn @@ -186,6 +184,10 @@ class Regimes_Frame: If 'all' (default), all the variables vary by regime. names : None, list of strings Names of independent variables for use in output + yend : boolean + If True, the last columns of cols2regi are considered for endogenous variables + rlist : boolean + If True, returns a list with the regimes indicators for each variable Returns ------- @@ -212,7 +214,7 @@ class Regimes_Frame: """ - def __init__(self, x, regimes, constant_regi, cols2regi, names=None, yend=False): + def __init__(self, x, regimes, constant_regi, cols2regi, names=None, yend=False, rlist=False): if cols2regi == "all": cols2regi = [True] * x.shape[1] else: @@ -228,8 +230,8 @@ def __init__(self, x, regimes, constant_regi, cols2regi, names=None, yend=False) cols2regi.insert(0, True) else: raise Exception( - "Invalid argument (%s) passed for 'constant_regi'. Please secify a valid term." - % str(constant) + "Invalid argument (%s) passed for 'constant_regi'. Please specify a valid term." + % str(constant_regi) ) try: x = regimeX_setup( @@ -252,12 +254,18 @@ def __init__(self, x, regimes, constant_regi, cols2regi, names=None, yend=False) self.kryd = 0 self.nr = len(set(regimes)) - if names: - names = set_name_x_regimes( - names, regimes, constant_regi, cols2regi, self.regimes_set - ) - - return (x, names) + if rlist: + if names: + names, r_list = set_name_x_regimes( + names, regimes, constant_regi, cols2regi, self.regimes_set, rlist + ) + return (x, names, r_list) + else: + if names: + names = set_name_x_regimes( + names, regimes, constant_regi, cols2regi, self.regimes_set + ) + return (x, names) def wald_test(betas, r, q, vm): @@ -290,7 +298,7 @@ def wald_test(betas, r, q, vm): rvri = la.inv(np.dot(r, np.dot(vm, r.T))) w = np.dot(rbq.T, np.dot(rvri, rbq))[0][0] df = r.shape[0] - pvalue = chisqprob(w, df) + pvalue = chi2.sf(w, df) return w, pvalue @@ -425,7 +433,7 @@ def regimeX_setup(x, regimes, cols2regi, regimes_set, constant=False): return xsp -def set_name_x_regimes(name_x, regimes, constant_regi, cols2regi, regimes_set): +def set_name_x_regimes(name_x, regimes, constant_regi, cols2regi, regimes_set, rlist=False): """ Generate the set of variable names in a regimes setup, according to the order of the betas @@ -475,11 +483,17 @@ def set_name_x_regimes(name_x, regimes, constant_regi, cols2regi, regimes_set): vars_regi = nxa[np.where(c2ra == True)] vars_glob = nxa[np.where(c2ra == False)] name_x_regi = [] + r_list = [] for r in regimes_set: rl = ["%s_%s" % (str(r), i) for i in vars_regi] name_x_regi.extend(rl) + r_list.extend([str(r)] * len(rl)) name_x_regi.extend(["_Global_%s" % i for i in vars_glob]) - return name_x_regi + r_list.extend(["_Global"] * len(vars_glob)) + if rlist: + return (name_x_regi, r_list) + else: + return name_x_regi def w_regime(w, regi_ids, regi_i, transform=True, min_n=None): diff --git a/spreg/twosls.py b/spreg/twosls.py index 6a6bd68f..0fe355b1 100644 --- a/spreg/twosls.py +++ b/spreg/twosls.py @@ -1,11 +1,12 @@ import numpy as np import numpy.linalg as la -from . import summary_output as SUMMARY from . import robust as ROBUST from . import user_output as USER -from .utils import spdot, sphstack, RegressionPropsY, RegressionPropsVM, set_warn +from .utils import spdot, sphstack, RegressionPropsY, RegressionPropsVM, set_warn, get_lags +import pandas as pd +from .output import output, _spat_diag_out -__author__ = "Luc Anselin luc.anselin@asu.edu, David C. Folch david.folch@asu.edu, Jing Yao jingyao@asu.edu" +__author__ = "Luc Anselin lanselin@gmail.com, Pedro Amaral pedrovma@gmail.com, David C. Folch david.folch@asu.edu, Jing Yao jingyao@asu.edu" __all__ = ["TSLS"] @@ -123,10 +124,8 @@ class BaseTSLS(RegressionPropsY, RegressionPropsVM): >>> q.append(db.by_col("DISCBD")) >>> q = np.array(q).T >>> reg = spreg.twosls.BaseTSLS(y, X, yd, q=q) - >>> print(reg.betas) - [[88.46579584] - [ 0.5200379 ] - [-1.58216593]] + >>> print(reg.betas.T) + [[88.46579584 0.5200379 -1.58216593]] >>> reg = spreg.twosls.BaseTSLS(y, X, yd, q=q, robust="white") """ @@ -248,6 +247,9 @@ class TSLS(BaseTSLS): gwk : pysal W object Kernel spatial weights needed for HAC estimation. Note: matrix must have ones along the main diagonal. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SLX type. sig2n_k : boolean If True, then use n-k to estimate sigma^2. If False, use n. spat_diag : boolean @@ -269,10 +271,13 @@ class TSLS(BaseTSLS): Name of kernel weights matrix for use in output name_ds : string Name of dataset for use in output - + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -423,11 +428,8 @@ class TSLS(BaseTSLS): >>> from spreg import TSLS >>> reg = TSLS(y, X, yd, q, name_x=['inc'], name_y='crime', name_yend=['hoval'], name_q=['discbd'], name_ds='columbus') - >>> print(reg.betas) - [[88.46579584] - [ 0.5200379 ] - [-1.58216593]] - + >>> print(reg.betas.T) + [[88.46579584 0.5200379 -1.58216593]] """ def __init__( @@ -439,6 +441,7 @@ def __init__( w=None, robust=None, gwk=None, + slx_lags=0, sig2n_k=False, spat_diag=False, vm=False, @@ -449,14 +452,28 @@ def __init__( name_w=None, name_gwk=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x, yend, q) y = USER.check_y(y, n) - USER.check_weights(w, y) USER.check_robust(robust, gwk) + if robust == "hac" and spat_diag: + set_warn( + self, + "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", + ) + spat_diag = False USER.check_spat_diag(spat_diag, w) x_constant, name_x, warn = USER.check_constant(x, name_x) + self.name_x = USER.set_name_x(name_x, x_constant) + if slx_lags>0: + USER.check_weights(w, y, w_required=True) + lag_x = get_lags(w, x_constant[:, 1:], slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + self.name_x += USER.set_name_spatial_lags(self.name_x[1:], slx_lags) + else: + USER.check_weights(w, y, w_required=False) set_warn(self, warn) BaseTSLS.__init__( self, @@ -469,9 +486,10 @@ def __init__( sig2n_k=sig2n_k, ) self.title = "TWO STAGE LEAST SQUARES" + if slx_lags > 0: + self.title += " WITH SPATIALLY LAGGED X (2SLS-SLX)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) - self.name_x = USER.set_name_x(name_x, x_constant) self.name_yend = USER.set_name_yend(name_yend, yend) self.name_z = self.name_x + self.name_yend self.name_q = USER.set_name_q(name_q, q) @@ -479,8 +497,15 @@ def __init__( self.robust = USER.set_robust(robust) self.name_w = USER.set_name_w(name_w, w) self.name_gwk = USER.set_name_w(name_gwk, gwk) - SUMMARY.TSLS(reg=self, vm=vm, w=w, spat_diag=spat_diag) - + self.output = pd.DataFrame(self.name_x + self.name_yend, + columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * len(self.name_yend) + self.output['regime'], self.output['equation'] = (0, 0) + if spat_diag: + diag_out = _spat_diag_out(self, w, 'yend') + else: + diag_out = None + output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) def _test(): import doctest @@ -521,4 +546,5 @@ def _test(): name_ds="columbus", name_w="columbus.gal", ) + print(tsls.output) print(tsls.summary) diff --git a/spreg/twosls_regimes.py b/spreg/twosls_regimes.py index 8c4ddfcb..50a36853 100644 --- a/spreg/twosls_regimes.py +++ b/spreg/twosls_regimes.py @@ -1,19 +1,18 @@ import numpy as np +import multiprocessing as mp +import pandas as pd from . import regimes as REGI from . import user_output as USER -import multiprocessing as mp -import scipy.sparse as SP -from .utils import sphstack, set_warn, RegressionProps_basic, spdot, sphstack +from .utils import set_warn, RegressionProps_basic, spdot, sphstack, get_lags from .twosls import BaseTSLS from .robust import hac_multi -from . import summary_output as SUMMARY -from platform import system +from .output import output, _spat_diag_out """ Two-stage Least Squares estimation with regimes. """ -__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu, David C. Folch david.folch@asu.edu" +__author__ = "Luc Anselin, Pedro V. Amaral, David C. Folch" class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame): @@ -65,6 +64,10 @@ class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame): gwk : pysal W object Kernel spatial weights needed for HAC estimation. Note: matrix must have ones along the main diagonal. + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the SLX type. + Note: WX is computed using the complete weights matrix sig2n_k : boolean If True, then use n-k to estimate sigma^2. If False, use n. vm : boolean @@ -89,9 +92,16 @@ class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame): Name of kernel weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe + summary : string + Summary of regression results and diagnostics (note: use in + conjunction with the print command) betas : array kx1 array of estimated coefficients u : array @@ -269,6 +279,62 @@ class TSLS_Regimes(BaseTSLS, REGI.Regimes_Frame): array([0.38389901, 0.09963973, 0.04672091, 0.22725012, 0.49181223, 0.19630774, 0.07784587, 0.25529011]) + >>> print(tslsr.summary) + REGRESSION RESULTS + ------------------ + + SUMMARY OF OUTPUT: TWO STAGE LEAST SQUARES ESTIMATION - REGIME 0 + ---------------------------------------------------------------- + Data set : NAT + Weights matrix : NAT.shp + Dependent Variable : 0_HR90 Number of Observations: 1673 + Mean dependent var : 3.3416 Number of Variables : 4 + S.D. dependent var : 4.6795 Degrees of Freedom : 1669 + Pseudo R-squared : 0.2092 + + ------------------------------------------------------------------------------------ + Variable Coefficient Std.Error z-Statistic Probability + ------------------------------------------------------------------------------------ + 0_CONSTANT 3.6697356 0.3838990 9.5591172 0.0000000 + 0_PS90 1.0695047 0.0996397 10.7337170 0.0000000 + 0_UE90 0.1468095 0.0467209 3.1422643 0.0016765 + 0_RD90 2.4586420 0.2272501 10.8191009 0.0000000 + ------------------------------------------------------------------------------------ + Instrumented: 0_RD90 + Instruments: 0_FP89 + Regimes variable: SOUTH + + SUMMARY OF OUTPUT: TWO STAGE LEAST SQUARES ESTIMATION - REGIME 1 + ---------------------------------------------------------------- + Data set : NAT + Weights matrix : NAT.shp + Dependent Variable : 1_HR90 Number of Observations: 1412 + Mean dependent var : 9.5493 Number of Variables : 4 + S.D. dependent var : 7.0389 Degrees of Freedom : 1408 + Pseudo R-squared : 0.2987 + + ------------------------------------------------------------------------------------ + Variable Coefficient Std.Error z-Statistic Probability + ------------------------------------------------------------------------------------ + 1_CONSTANT 9.5587324 0.4918122 19.4357356 0.0000000 + 1_PS90 1.9466635 0.1963077 9.9163867 0.0000000 + 1_UE90 -0.3081021 0.0778459 -3.9578483 0.0000756 + 1_RD90 3.6871812 0.2552901 14.4431026 0.0000000 + ------------------------------------------------------------------------------------ + Instrumented: 1_RD90 + Instruments: 1_FP89 + Regimes variable: SOUTH + ------------------------------------------------------------------------------------ + GLOBAL DIAGNOSTICS + + REGIMES DIAGNOSTICS - CHOW TEST + VARIABLE DF VALUE PROB + CONSTANT 1 89.093 0.0000 + PS90 1 15.876 0.0001 + UE90 1 25.106 0.0000 + RD90 1 12.920 0.0003 + Global test 4 201.237 0.0000 + ================================ END OF REPORT ===================================== """ def __init__( @@ -281,6 +347,7 @@ def __init__( w=None, robust=None, gwk=None, + slx_lags=0, sig2n_k=True, spat_diag=False, vm=False, @@ -297,16 +364,36 @@ def __init__( name_gwk=None, name_ds=None, summ=True, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) - USER.check_weights(w, y) USER.check_robust(robust, gwk) + if robust == "hac": + if regime_err_sep: + set_warn( + self, + "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.", + ) + regime_err_sep = False + if spat_diag: + set_warn( + self, + "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", + ) + spat_diag = False USER.check_spat_diag(spat_diag, w) x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) name_x = USER.set_name_x(name_x, x_constant, constant=True) + if slx_lags > 0: + USER.check_weights(w, y, w_required=True) + lag_x = get_lags(w, x_constant, slx_lags) + x_constant = np.hstack((x_constant, lag_x)) + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + else: + USER.check_weights(w, y, w_required=False) self.constant_regi = constant_regi self.cols2regi = cols2regi self.name_ds = USER.set_name_ds(name_ds) @@ -324,12 +411,6 @@ def __init__( self.regimes_set = REGI._get_regimes_set(regimes) self.regimes = regimes USER.check_regimes(self.regimes_set, self.n, x_constant.shape[1]) - if regime_err_sep == True and robust == "hac": - set_warn( - self, - "Error by regimes is incompatible with HAC estimation for 2SLS models. Hence, the error by regimes has been disabled for this model.", - ) - regime_err_sep = False self.regime_err_sep = regime_err_sep if ( regime_err_sep == True @@ -348,6 +429,7 @@ def __init__( regi_ids, cores, gwk, + slx_lags, sig2n_k, robust, spat_diag, @@ -355,20 +437,23 @@ def __init__( name_x, name_yend, name_q, + summ, + latex ) else: q, self.name_q = REGI.Regimes_Frame.__init__( self, q, regimes, constant_regi=None, cols2regi="all", names=name_q ) - x, self.name_x = REGI.Regimes_Frame.__init__( + x, self.name_x, x_rlist = REGI.Regimes_Frame.__init__( self, x_constant, regimes, constant_regi, cols2regi=cols2regi, names=name_x, + rlist=True ) - yend, self.name_yend = REGI.Regimes_Frame.__init__( + yend, self.name_yend, yend_rlist = REGI.Regimes_Frame.__init__( self, yend, regimes, @@ -376,13 +461,23 @@ def __init__( cols2regi=cols2regi, yend=True, names=name_yend, + rlist=True ) - if regime_err_sep == True and robust == None: - robust = "white" + self.output = pd.DataFrame(self.name_x+self.name_yend, + columns=['var_names']) + self.output['var_type'] = ['x']*len(self.name_x)+['yend']*len(self.name_yend) + self.output['regime'] = x_rlist+yend_rlist + self.output['equation'] = 0 + BaseTSLS.__init__( self, y=y, x=x, yend=yend, q=q, robust=robust, gwk=gwk, sig2n_k=sig2n_k ) - self.title = "TWO STAGE LEAST SQUARES - REGIMES" + + if slx_lags == 0: + self.title = "TWO STAGE LEAST SQUARES - REGIMES" + else: + self.title = "TWO STAGE LEAST SQUARES WITH SPATIALLY LAGGED X (2SLS-SLX) - REGIMES" + if robust == "ogmm": _optimal_weight(self, sig2n_k) self.name_z = self.name_x + self.name_yend @@ -390,7 +485,12 @@ def __init__( self.chow = REGI.Chow(self) self.robust = USER.set_robust(robust) if summ: - SUMMARY.TSLS(reg=self, vm=vm, w=w, spat_diag=spat_diag, regimes=True) + if spat_diag: + diag_out = _spat_diag_out(self, w, 'yend') + else: + diag_out = None + output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) + def _tsls_regimes_multi( self, @@ -401,6 +501,7 @@ def _tsls_regimes_multi( regi_ids, cores, gwk, + slx_lags, sig2n_k, robust, spat_diag, @@ -408,6 +509,8 @@ def _tsls_regimes_multi( name_x, name_yend, name_q, + summ, + latex ): results_p = {} """ @@ -421,7 +524,7 @@ def _tsls_regimes_multi( is_win = False """ x_constant, name_x = REGI.check_const_regi(self, x, name_x, regi_ids) - self.name_x_r = name_x + self.name_x_r = name_x + name_yend for r in self.regimes_set: if cores: pool = mp.Pool(None) @@ -444,6 +547,7 @@ def _tsls_regimes_multi( name_q, self.name_w, self.name_regimes, + slx_lags ), ) else: @@ -465,6 +569,7 @@ def _tsls_regimes_multi( name_q, self.name_w, self.name_regimes, + slx_lags ) ) @@ -495,6 +600,7 @@ def _tsls_regimes_multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -526,8 +632,13 @@ def _tsls_regimes_multi( self.name_q += results[r].name_q self.name_z += results[r].name_z self.name_h += results[r].name_h + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x+results[r].name_yend, + 'var_type': ['x']*len(results[r].name_x)+['yend']*len(results[r].name_yend), + 'regime': r, 'equation': r})], ignore_index=True) + counter += 1 self.multi = results + self.hac_var = sphstack(x_constant[:, 1:], q) if robust == "hac": hac_multi(self, gwk) @@ -539,9 +650,12 @@ def _tsls_regimes_multi( self.chow = REGI.Chow(self) if spat_diag: self._get_spat_diag_props(results, regi_ids, x_constant, yend, q) - SUMMARY.TSLS_multi( - reg=self, multireg=self.multi, vm=vm, spat_diag=spat_diag, regimes=True, w=w - ) + diag_out = _spat_diag_out(self, w, 'yend') + else: + diag_out = None + if summ: + self.output.sort_values(by='regime', inplace=True) + output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) def _get_spat_diag_props(self, results, regi_ids, x, yend, q): self._cache = {} @@ -578,6 +692,7 @@ def _work( name_q, name_w, name_regimes, + slx_lags, ): y_r = y[regi_ids[r]] x_r = x[regi_ids[r]] @@ -588,7 +703,10 @@ def _work( else: robust2 = robust model = BaseTSLS(y_r, x_r, yend_r, q_r, robust=robust2, sig2n_k=sig2n_k) - model.title = "TWO STAGE LEAST SQUARES ESTIMATION - REGIME %s" % r + if slx_lags == 0: + model.title = "TWO STAGE LEAST SQUARES ESTIMATION - REGIME %s" % r + else: + model.title = "TWO STAGE LEAST SQUARES ESTIMATION WITH SLX - REGIME %s" % r if robust == "ogmm": _optimal_weight(model, sig2n_k, warn=False) model.robust = USER.set_robust(robust) @@ -628,7 +746,7 @@ def _optimal_weight(reg, sig2n_k, warn=True): else: vm = fac2 * reg.n RegressionProps_basic(reg, betas=betas, vm=vm, sig2=False) - reg.title += " (Optimal-Weighted GMM)" + #reg.title += " (Optimal-Weighted GMM)" if warn: set_warn( reg, "Residuals treated as homoskedastic for the purpose of diagnostics." @@ -663,20 +781,31 @@ def _test(): q = np.array([db.by_col(name) for name in q_var]).T r_var = "SOUTH" regimes = db.by_col(r_var) + w = libpysal.weights.Rook.from_shapefile(nat.get_path("natregimes.shp")) + w.transform = "r" tslsr = TSLS_Regimes( y, x, yd, q, regimes, + w = w, constant_regi="many", - spat_diag=False, + spat_diag=True, name_y=y_var, name_x=x_var, name_yend=yd_var, name_q=q_var, name_regimes=r_var, - cols2regi=[False, True, True, True], + #cols2regi=[False, True, True, False], sig2n_k=False, + regime_err_sep = True, + #robust = 'hac', + vm = False ) + print(tslsr.output) print(tslsr.summary) + + + + diff --git a/spreg/twosls_sp.py b/spreg/twosls_sp.py index 47803d69..e5ff8552 100755 --- a/spreg/twosls_sp.py +++ b/spreg/twosls_sp.py @@ -7,8 +7,9 @@ import numpy as np from . import twosls as TSLS from . import user_output as USER -from . import summary_output as SUMMARY from .utils import set_endog, sp_att, set_warn +import pandas as pd +from .output import output, _spat_diag_out, _spat_pseudo_r2 __all__ = ["GM_Lag"] @@ -44,6 +45,9 @@ class BaseGM_Lag(TSLS.BaseTSLS): lag_q : boolean If True, then include spatial lags of the additional instruments (q). + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the Spatial Durbin type. robust : string If 'white', then a White consistent estimator of the variance-covariance matrix is given. If 'hac', then a @@ -172,15 +176,19 @@ def __init__( q=None, w=None, w_lags=1, + slx_lags=0, lag_q=True, robust=None, gwk=None, sig2n_k=False, ): - yend2, q2 = set_endog( - y, x[:, 1:], w, yend, q, w_lags, lag_q - ) # assumes constant in first column + if slx_lags > 0: + yend2, q2, wx = set_endog(y, x[:, 1:], w, yend, q, w_lags, lag_q, slx_lags) + x = np.hstack((x, wx)) + else: + yend2, q2 = set_endog(y, x[:, 1:], w, yend, q, w_lags, lag_q) + TSLS.BaseTSLS.__init__( self, y=y, x=x, yend=yend2, q=q2, robust=robust, gwk=gwk, sig2n_k=sig2n_k ) @@ -216,6 +224,9 @@ class GM_Lag(BaseGM_Lag): lag_q : boolean If True, then include spatial lags of the additional instruments (q). + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the Spatial Durbin type. robust : string If 'white', then a White consistent estimator of the variance-covariance matrix is given. If 'hac', then a @@ -245,9 +256,13 @@ class GM_Lag(BaseGM_Lag): Name of kernel weights matrix for use in output name_ds : string Name of dataset for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -478,6 +493,7 @@ def __init__( w=None, w_lags=1, lag_q=True, + slx_lags=0, robust=None, gwk=None, sig2n_k=False, @@ -490,13 +506,24 @@ def __init__( name_w=None, name_gwk=None, name_ds=None, + latex=False, ): n = USER.check_arrays(x, yend, q) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) USER.check_robust(robust, gwk) + if robust == "hac" and spat_diag: + set_warn( + self, + "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", + ) + spat_diag = False x_constant, name_x, warn = USER.check_constant(x, name_x) + + if slx_lags > 0: + name_x += USER.set_name_spatial_lags(name_x, slx_lags) + set_warn(self, warn) BaseGM_Lag.__init__( self, @@ -506,6 +533,7 @@ def __init__( yend=yend, q=q, w_lags=w_lags, + slx_lags=slx_lags, robust=robust, gwk=gwk, lag_q=lag_q, @@ -517,20 +545,38 @@ def __init__( ) set_warn(self, warn) self.title = "SPATIAL TWO STAGE LEAST SQUARES" + if slx_lags > 0: + self.title += " WITH SLX (SPATIAL DURBIN MODEL)" self.name_ds = USER.set_name_ds(name_ds) self.name_y = USER.set_name_y(name_y) - self.name_x = USER.set_name_x(name_x, x_constant) + self.name_x = USER.set_name_x(name_x, x_constant) # name_x contains SLX terms for slx_lags > 0 self.name_yend = USER.set_name_yend(name_yend, yend) self.name_yend.append(USER.set_name_yend_sp(self.name_y)) self.name_z = self.name_x + self.name_yend self.name_q = USER.set_name_q(name_q, q) - self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) + if slx_lags > 0: # need to remove all but last SLX variables from name_x + self.name_x0 = [] + self.name_x0.append(self.name_x[0]) # constant +# print(f"x0 first {self.name_x0}") + kx = int((self.k -self.kstar -1)/(slx_lags +1) ) # number of original exogenous vars + self.name_x0.extend(self.name_x[-kx:]) +# print(f"in here {self.name_x0}") + self.name_q.extend(USER.set_name_q_sp(self.name_x0, w_lags, self.name_q, lag_q)) + else: + self.name_q.extend(USER.set_name_q_sp(self.name_x, w_lags, self.name_q, lag_q)) self.name_h = USER.set_name_h(self.name_x, self.name_q) self.robust = USER.set_robust(robust) self.name_w = USER.set_name_w(name_w, w) self.name_gwk = USER.set_name_w(name_gwk, gwk) - SUMMARY.GM_Lag(reg=self, w=w, vm=vm, spat_diag=spat_diag) - + self.output = pd.DataFrame(self.name_x + self.name_yend, columns=['var_names']) + self.output['var_type'] = ['x'] * len(self.name_x) + ['yend'] * (len(self.name_yend)-1) + ['rho'] + self.output['regime'], self.output['equation'] = (0, 0) + self.other_top = _spat_pseudo_r2(self) + if spat_diag: + diag_out = _spat_diag_out(self, w, 'yend') + else: + diag_out = None + output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) def _test(): import doctest @@ -572,4 +618,5 @@ def _test(): name_ds="columbus", name_w="columbus.gal", ) + print(model.output) print(model.summary) diff --git a/spreg/twosls_sp_regimes.py b/spreg/twosls_sp_regimes.py index cfc0bda7..db0b23c3 100644 --- a/spreg/twosls_sp_regimes.py +++ b/spreg/twosls_sp_regimes.py @@ -7,12 +7,13 @@ import numpy as np from . import regimes as REGI from . import user_output as USER -from . import summary_output as SUMMARY import multiprocessing as mp from .twosls_regimes import TSLS_Regimes, _optimal_weight from .twosls import BaseTSLS from .utils import set_endog, set_endog_sparse, sp_att, set_warn, sphstack, spdot from .robust import hac_multi +import pandas as pd +from .output import output, _spat_diag_out, _spat_pseudo_r2 class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): @@ -62,6 +63,9 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): lag_q : boolean If True, then include spatial lags of the additional instruments (q). + slx_lags : integer + Number of spatial lags of X to include in the model specification. + If slx_lags>0, the specification becomes of the Spatial Durbin type. regime_lag_sep: boolean If True (default), the spatial parameter for spatial lag is also computed according to different regimes. If False, @@ -107,9 +111,13 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): Name of dataset for use in output name_regimes : string Name of regimes variable for use in output + latex : boolean + Specifies if summary is to be printed in latex format Attributes ---------- + output : dataframe + regression results pandas dataframe summary : string Summary of regression results and diagnostics (note: use in conjunction with the print command) @@ -378,15 +386,16 @@ class GM_Lag_Regimes(TSLS_Regimes, REGI.Regimes_Frame): models, the argument regime_lag_sep must be set to True: >>> model=GM_Lag_Regimes(y, x, regimes, w=w, regime_lag_sep=True, name_y=y_var, name_x=x_var, name_regimes=r_var, name_ds='NAT', name_w='NAT.shp') - >>> print(np.hstack((np.array(model.name_z).reshape(8,1),model.betas,np.sqrt(model.vm.diagonal().reshape(8,1))))) - [['0_CONSTANT' '1.3658476998618099' '0.3985472089832652'] - ['0_PS90' '0.8087573074246643' '0.11324884794883601'] - ['0_UE90' '0.5694681319188577' '0.04625087717092595'] - ['0_W_HR90' '-0.43424389464634316' '0.13350159258670305'] - ['1_CONSTANT' '7.90731073341874' '1.6360187416950998'] - ['1_PS90' '1.2746570332609135' '0.2470987049452741'] - ['1_UE90' '0.6016769336173784' '0.07993322102145078'] - ['1_W_HR90' '-0.2960338343846942' '0.19934459782427025']] + >>> print(model.output) + var_names coefficients std_err zt_stat prob + 0 0_CONSTANT 1.365848 0.398547 3.427066 0.00061 + 1 0_PS90 0.808757 0.113249 7.141418 0.0 + 2 0_UE90 0.569468 0.046251 12.312591 0.0 + 3 0_W_HR90 -0.434244 0.133502 -3.252724 0.001143 + 4 1_CONSTANT 7.907311 1.636019 4.833264 0.000001 + 5 1_PS90 1.274657 0.247099 5.158493 0.0 + 6 1_UE90 0.601677 0.079933 7.527245 0.0 + 7 1_W_HR90 -0.296034 0.199345 -1.485036 0.137534 Alternatively, we can type: 'model.summary' to see the organized results output. The class is flexible enough to accomodate a spatial lag model that, @@ -432,6 +441,7 @@ def __init__( q=None, w=None, w_lags=1, + slx_lags=0, lag_q=True, robust=None, gwk=None, @@ -439,7 +449,7 @@ def __init__( spat_diag=False, constant_regi="many", cols2regi="all", - regime_lag_sep=False, + regime_lag_sep=True, regime_err_sep=True, cores=False, vm=False, @@ -451,12 +461,32 @@ def __init__( name_w=None, name_gwk=None, name_ds=None, + latex=False, ): n = USER.check_arrays(y, x) y = USER.check_y(y, n) USER.check_weights(w, y, w_required=True) USER.check_robust(robust, gwk) + if regime_lag_sep and not regime_err_sep: + set_warn(self, "regime_err_sep set to True when regime_lag_sep=True.") + regime_err_sep = True + if regime_err_sep and not regime_lag_sep: + set_warn(self, "Groupwise heteroskedasticity is not currently available for this method,\n so regime_err_sep has been set to False.") + regime_err_sep = False + if robust == "hac": + if regime_err_sep: + set_warn( + self, + "Error by regimes is not available for HAC estimation. The error by regimes has been disabled for this model.", + ) + regime_err_sep = False + if spat_diag: + set_warn( + self, + "Spatial diagnostics are not available for HAC estimation. The spatial diagnostics have been disabled for this model.", + ) + spat_diag = False USER.check_spat_diag(spat_diag, w) x_constant, name_x, warn = USER.check_constant(x, name_x, just_rem=True) set_warn(self, warn) @@ -464,12 +494,20 @@ def __init__( name_y = USER.set_name_y(name_y) name_yend = USER.set_name_yend(name_yend, yend) name_q = USER.set_name_q(name_q, q) - name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) self.name_regimes = USER.set_name_ds(name_regimes) self.constant_regi = constant_regi - self.n = n + if slx_lags > 0: + yend2, q2, wx = set_endog(y, x_constant, w, yend, q, w_lags, lag_q, slx_lags) + x_constant = np.hstack((x_constant, wx)) + name_slx = USER.set_name_spatial_lags(name_x, slx_lags) + name_q.extend(USER.set_name_q_sp(name_slx[-len(name_x):], w_lags, name_q, lag_q, force_all=True)) + name_x += name_slx + else: + name_q.extend(USER.set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=True)) + yend2, q2 = yend, q + self.n = x_constant.shape[0] cols2regi = REGI.check_cols2regi( - constant_regi, cols2regi, x_constant, yend=yend, add_cons=False + constant_regi, cols2regi, x_constant, yend=yend2, add_cons=False ) self.cols2regi = cols2regi self.regimes_set = REGI._get_regimes_set(regimes) @@ -485,8 +523,6 @@ def __init__( self.regime_err_sep = regime_err_sep self.regime_lag_sep = regime_lag_sep if regime_lag_sep == True: - if not regime_err_sep: - raise Exception("regime_err_sep must be True when regime_lag_sep=True.") cols2regi += [True] w_i, regi_ids, warn = REGI.w_regimes( w, @@ -513,9 +549,10 @@ def __init__( w_i, w, regi_ids, - yend=yend, - q=q, + yend=yend2, + q=q2, w_lags=w_lags, + slx_lags=slx_lags, lag_q=lag_q, cores=cores, robust=robust, @@ -532,11 +569,13 @@ def __init__( name_w=name_w, name_gwk=name_gwk, name_ds=name_ds, + latex=latex, ) else: if regime_lag_sep == True: w = REGI.w_regimes_union(w, w_i, self.regimes_set) - yend2, q2 = set_endog(y, x_constant, w, yend, q, w_lags, lag_q) + if slx_lags == 0: + yend2, q2 = set_endog(y, x_constant, w, yend2, q2, w_lags, lag_q) name_yend.append(USER.set_name_yend_sp(name_y)) TSLS_Regimes.__init__( self, @@ -564,17 +603,26 @@ def __init__( name_ds=name_ds, summ=False, ) - if regime_lag_sep: + + if regime_lag_sep: # not currently available. self.sp_att_reg(w_i, regi_ids, yend2[:, -1].reshape(self.n, 1)) else: self.rho = self.betas[-1] + self.output.iat[-1, self.output.columns.get_loc('var_type')] = 'rho' self.predy_e, self.e_pred, warn = sp_att( w, self.y, self.predy, yend2[:, -1].reshape(self.n, 1), self.rho ) set_warn(self, warn) self.regime_lag_sep = regime_lag_sep self.title = "SPATIAL " + self.title - SUMMARY.GM_Lag(reg=self, w=w, vm=vm, spat_diag=spat_diag, regimes=True) + if slx_lags > 0: + self.title = " SPATIAL 2SLS WITH SLX (SPATIAL DURBIN MODEL) - REGIMES" + self.other_top = _spat_pseudo_r2(self) + if spat_diag: + diag_out = _spat_diag_out(self, w, 'yend') + else: + diag_out = None + output(reg=self, vm=vm, robust=robust, other_end=diag_out, latex=latex) def GM_Lag_Regimes_Multi( self, @@ -587,6 +635,7 @@ def GM_Lag_Regimes_Multi( yend=None, q=None, w_lags=1, + slx_lags=0, lag_q=True, robust=None, gwk=None, @@ -602,6 +651,7 @@ def GM_Lag_Regimes_Multi( name_w=None, name_gwk=None, name_ds=None, + latex=False, ): # pool = mp.Pool(cores) self.name_ds = USER.set_name_ds(name_ds) @@ -636,6 +686,7 @@ def GM_Lag_Regimes_Multi( q, w_r, w_lags, + slx_lags, lag_q, robust, sig2n_k, @@ -659,6 +710,7 @@ def GM_Lag_Regimes_Multi( q, w_r, w_lags, + slx_lags, lag_q, robust, sig2n_k, @@ -702,6 +754,7 @@ def GM_Lag_Regimes_Multi( self.name_h, ) = ([], [], [], [], [], []) counter = 0 + self.output = pd.DataFrame(columns=['var_names', 'var_type', 'regime', 'equation']) for r in self.regimes_set: """ if is_win: @@ -752,6 +805,14 @@ def GM_Lag_Regimes_Multi( self.hac_var[ regi_ids[r], ] = results[r].h + results[r].other_top = _spat_pseudo_r2(results[r]) + results[r].other_mid = "" + if spat_diag: + results[r].other_mid += _spat_diag_out(results[r], results[r].w, 'yend') + self.output = pd.concat([self.output, pd.DataFrame({'var_names': results[r].name_x + results[r].name_yend, + 'var_type': ['x'] * len(results[r].name_x) + [ + 'yend'] * (len(results[r].name_yend)-1) + ['rho'], + 'regime': r, 'equation': r})], ignore_index=True) counter += 1 self.multi = results if robust == "hac": @@ -763,11 +824,9 @@ def GM_Lag_Regimes_Multi( ) self.chow = REGI.Chow(self) if spat_diag: - pass # self._get_spat_diag_props(y, x, w, yend, q, w_lags, lag_q) - SUMMARY.GM_Lag_multi( - reg=self, multireg=self.multi, vm=vm, spat_diag=spat_diag, regimes=True, w=w - ) + pass + output(reg=self, vm=vm, robust=robust, other_end=False, latex=latex) def sp_att_reg(self, w_i, regi_ids, wy): predy_e_r, e_pred_r = {}, {} @@ -823,6 +882,7 @@ def _work( q, w_r, w_lags, + slx_lags, lag_q, robust, sig2n_k, @@ -844,14 +904,18 @@ def _work( q_r = q[regi_ids[r]] else: q_r = q - yend_r, q_r = set_endog_sparse(y_r, x_r[:, 1:], w_r, yend_r, q_r, w_lags, lag_q) + if slx_lags == 0: + yend_r, q_r = set_endog_sparse(y_r, x_r[:, 1:], w_r, yend_r, q_r, w_lags, lag_q) + title = "SPATIAL TWO STAGE LEAST SQUARES ESTIMATION - REGIME %s" % r + else: + title = "SPATIAL 2SLS WITH SLX (SPATIAL DURBIN MODEL) - REGIME %s" % r # x_constant = USER.check_constant(x_r) if robust == "hac" or robust == "ogmm": robust2 = None else: robust2 = robust model = BaseTSLS(y_r, x_r, yend_r, q_r, robust=robust2, sig2n_k=sig2n_k) - model.title = "SPATIAL TWO STAGE LEAST SQUARES ESTIMATION - REGIME %s" % r + model.title = title if robust == "ogmm": _optimal_weight(model, sig2n_k, warn=False) model.rho = model.betas[-1] @@ -917,6 +981,8 @@ def _test(): name_ds="columbus", name_w="columbus.gal", regime_err_sep=True, + regime_lag_sep = True, robust="white", ) + print(model.output) print(model.summary) diff --git a/spreg/user_output.py b/spreg/user_output.py index ffdb2324..cf7f9e68 100755 --- a/spreg/user_output.py +++ b/spreg/user_output.py @@ -1,7 +1,8 @@ """Internal helper files for user output.""" __author__ = ( - "Luc Anselin luc.anselin@asu.edu, " + "Luc Anselin lanselin@gmail.com, " + "Pedro Amaral pedrovma@gmail.com" "David C. Folch david.folch@asu.edu, " "Levi John Wolf levi.john.wolf@gmail.com, " "Jing Yao jingyao@asu.edu" @@ -147,6 +148,23 @@ def set_name_yend_sp(name_y): """ return "W_" + name_y +def set_name_spatial_lags(names, w_lags): + """Set the spatial lag names for multiple variables and lag orders" + + Parameters + ---------- + names : string + Original variables' names. + + Returns + ------- + lag_names : string + + """ + lag_names = ["W_" + s for s in names] + for i in range(w_lags-1): + lag_names += ["W" + str(i+2) + "_" + s for s in names] + return lag_names def set_name_q_sp(name_x, w_lags, name_q, lag_q, force_all=False): """Set the spatial instrument names in regression; return generic name if user @@ -544,8 +562,9 @@ def check_robust(robust, wk): # NOTE: we are not checking for the case of exactly 1.0 ### raise Exception("Off-diagonal entries must be less than 1.") elif robust.lower() == "white" or robust.lower() == "ogmm": - if wk: - raise Exception("White requires that wk be set to None") + # if wk: + # raise Exception("White requires that wk be set to None") + pass # these options are not affected by wk else: raise Exception( "invalid value passed to robust, see docs for valid options" diff --git a/spreg/utils.py b/spreg/utils.py index ff4f6d2c..c369c74d 100755 --- a/spreg/utils.py +++ b/spreg/utils.py @@ -479,7 +479,7 @@ def get_lags(w, x, w_lags): Returns -------- rs : array - nxk*(w_lags+1) array with original and spatially lagged variables + nxk*(w_lags) array with spatially lagged variables """ lag = lag_spatial(w, x) @@ -489,6 +489,43 @@ def get_lags(w, x, w_lags): spat_lags = sphstack(spat_lags, lag) return spat_lags +def get_lags_split(w, x, max_lags, split_at): + """ + Calculates a given order of spatial lags and all the smaller orders, + separated into two groups (up to split_at and above) + + Parameters + ---------- + w : weight + PySAL weights instance + x : array + nxk arrays with the variables to be lagged + max_lags : integer + Maximum order of spatial lag + split_at: integer + Separates the resulting lags into two groups: up to split_at and above + + Returns + -------- + rs_l,rs_h: tuple of arrays + rs_l: nxk*(split_at) array with spatially lagged variables up to split_at + rs_h: nxk*(w_lags-split_at) array with spatially lagged variables above split_at + + """ + rs_l = lag = lag_spatial(w, x) + rs_h = None + if 0 < split_at < max_lags: + for _ in range(split_at-1): + lag = lag_spatial(w, lag) + rs_l = sphstack(rs_l, lag) + + for i in range(max_lags - split_at): + lag = lag_spatial(w, lag) + rs_h = sphstack(rs_h, lag) if i > 0 else lag + else: + raise ValueError("max_lags must be greater than split_at and split_at must be greater than 0") + + return rs_l, rs_h def inverse_prod( w, @@ -571,9 +608,11 @@ def inverse_prod( except: matrix = la.inv(np.eye(w.shape[0]) - (scalar * w)) if post_multiply: - inv_prod = spdot(data.T, matrix) +# inv_prod = spdot(data.T, matrix) + inv_prod = np.matmul(data.T,matrix) # inverse matrix is dense, wrong type in spdot else: - inv_prod = spdot(matrix, data) +# inv_prod = spdot(matrix, data) + inv_prod = np.matmul(matrix,data) else: raise Exception("Invalid method selected for inversion.") return inv_prod @@ -623,24 +662,35 @@ def power_expansion( return running_total -def set_endog(y, x, w, yend, q, w_lags, lag_q): +def set_endog(y, x, w, yend, q, w_lags, lag_q, slx_lags=0): # Create spatial lag of y yl = lag_spatial(w, y) # spatial and non-spatial instruments if issubclass(type(yend), np.ndarray): + if slx_lags > 0: + lag_x, lag_xq = get_lags_split(w, x, slx_lags+1, slx_lags) + else: + lag_xq = x if lag_q: - lag_vars = sphstack(x, q) + lag_vars = sphstack(lag_xq, q) else: - lag_vars = x + lag_vars = lag_xq spatial_inst = get_lags(w, lag_vars, w_lags) q = sphstack(q, spatial_inst) yend = sphstack(yend, yl) elif yend == None: # spatial instruments only - q = get_lags(w, x, w_lags) + if slx_lags > 0: + lag_x, lag_xq = get_lags_split(w, x, slx_lags+w_lags, slx_lags) + else: + lag_xq = get_lags(w, x, w_lags) + q = lag_xq yend = yl else: raise Exception("invalid value passed to yend") - return yend, q + if slx_lags == 0: + return yend, q + else: + return yend, q, lag_x lag = lag_spatial(w, x) spat_lags = lag