diff --git a/src/tabmat/constructor.py b/src/tabmat/constructor.py index c7e82f0e..c87b211f 100644 --- a/src/tabmat/constructor.py +++ b/src/tabmat/constructor.py @@ -18,9 +18,68 @@ from .sparse_matrix import SparseMatrix from .split_matrix import SplitMatrix +try: + import polars as pl +except ImportError: + pl = None +try: + import pandas as pd +except ImportError: + pd = None -def from_pandas( + +def _is_numeric(series, engine: str): + if engine == "pandas": + return pd.api.types.is_numeric_dtype(series) + elif engine == "polars": + return series.dtype.is_numeric() + else: + raise ValueError(f"Unknown engine: {engine}") + + +def _iter_columns(df, engine: str): + if engine == "pandas": + return df.items() + elif engine == "polars": + return ((col.name, col) for col in df.iter_columns()) + else: + raise ValueError(f"Unknown engine: {engine}") + + +def _object_as_cat(series, engine: str): + if engine == "pandas": + if series.dtype == object: + return series.astype("category") + return series + elif engine == "polars": + if series.dtype == pl.String: + return series.cast(pl.Categorical) + return series + else: + raise ValueError(f"Unknown engine: {engine}") + + +def _is_categorical(series, engine: str): + if engine == "pandas": + return isinstance(series.dtype, pd.CategoricalDtype) + elif engine == "polars": + return isinstance(series.dtype, (pl.Categorical, pl.Enum)) + else: + raise ValueError(f"Unknown engine: {engine}") + + +def _select_cols(df, idx, engine): + if engine == "pandas": + return df.iloc[:, idx] + elif engine == "polars": + return df.select(pl.nth(idx)) + else: + raise ValueError(f"Unknown engine: {engine}") + + +def _from_dataframe( df, + engine: str, dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, @@ -32,63 +91,27 @@ def from_pandas( cat_missing_name: str = "(MISSING)", ) -> MatrixBase: """ - Transform a pandas.DataFrame into an efficient SplitMatrix. 
- - Parameters - ---------- - df : pd.DataFrame - pandas DataFrame to convert. - dtype : np.dtype, default np.float64 - dtype of all sub-matrices of the resulting SplitMatrix. - sparse_threshold : float, default 0.1 - Density threshold below which numerical columns will be stored in a sparse - format. - cat_threshold : int, default 4 - Number of levels of a categorical column under which the column will be stored - as sparse one-hot-encoded columns instead of CategoricalMatrix - object_as_cat : bool, default False - If True, DataFrame columns stored as python objects will be treated as - categorical columns. - cat_position : str {'end'|'expand'}, default 'expand' - Position of the categorical variable in the index. If "last", all the - categoricals (including the ones that did not satisfy cat_threshold) - will be placed at the end of the index list. If "expand", all the variables - will remain in the same order. - drop_first : bool, default False - If true, categoricals variables will have their first category dropped. - This allows multiple categorical variables to be included in an - unregularized model. If False, all categories are included. - cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' - How to handle missing values in categorical columns: - - if 'fail', raise an error if there are missing values. - - if 'zero', missing values will represent all-zero indicator columns. - - if 'convert', missing values will be converted to the '(MISSING)' category. - cat_missing_name: str, default '(MISSING)' - Name of the category to which missing values will be converted if - ``cat_missing_method='convert'``. + See docstring of from_pandas or from_polars for details. - Returns - ------- - SplitMatrix + engine should be either 'pandas' or 'polars'. 
""" - import pandas as pd matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] indices: list[list[int]] = [] is_cat: list[bool] = [] - dense_columns = [] # column index in original DataFrame - dense_indices = [] # index in the new SplitMatrix - sparse_columns = [] # sparse columns to join together - sparse_indices = [] # index in the new SplitMatrix + dense_dfidx = [] # column index in original DataFrame + dense_tmidx = [] # index in the new SplitMatrix + sparse_dfidx = [] # column index in the original DataFrame + sparse_tmidx = [] # index in the new SplitMatrix ignored_cols = [] mxcolidx = 0 - for colname, coldata in df.items(): - if object_as_cat and coldata.dtype == object: - coldata = coldata.astype("category") - if isinstance(coldata.dtype, pd.CategoricalDtype): + for dfcolidx, (colname, coldata) in enumerate(_iter_columns(df, engine)): + if object_as_cat: + coldata = _object_as_cat(coldata, engine) + if _is_categorical(coldata, engine): cat = CategoricalMatrix( coldata, drop_first=drop_first, @@ -131,14 +154,14 @@ def from_pandas( mxcolidx += cat.shape[1] elif cat_position == "end": indices.append(np.arange(cat.shape[1])) - elif pd.api.types.is_numeric_dtype(coldata): + elif _is_numeric(coldata, engine): if (coldata != 0).mean() <= sparse_threshold: - sparse_columns.append(colname) - sparse_indices.append(mxcolidx) + sparse_dfidx.append(dfcolidx) + sparse_tmidx.append(mxcolidx) mxcolidx += 1 else: - dense_columns.append(colname) - dense_indices.append(mxcolidx) + dense_dfidx.append(dfcolidx) + dense_tmidx.append(mxcolidx) mxcolidx += 1 else: @@ -148,13 +171,26 @@ def from_pandas( warnings.warn( f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." 
) - if dense_columns: - matrices.append(_dense_matrix(df, dense_columns, dtype)) - indices.append(dense_indices) + if dense_dfidx: + matrices.append( + DenseMatrix( + _select_cols(df, dense_dfidx, engine).to_numpy().astype(dtype), + column_names=np.asarray(df.columns)[dense_dfidx], + term_names=np.asarray(df.columns)[dense_dfidx], + ) + ) + indices.append(dense_tmidx) is_cat.append(False) - if sparse_columns: - matrices.append(_sparse_matrix(df, sparse_columns, dtype)) - indices.append(sparse_indices) + if sparse_dfidx: + matrices.append( + SparseMatrix( + sps.coo_matrix(_select_cols(df, sparse_dfidx, engine), dtype=dtype), + dtype=dtype, + column_names=np.asarray(df.columns)[sparse_dfidx], + term_names=np.asarray(df.columns)[sparse_dfidx], + ) + ) + indices.append(sparse_tmidx) is_cat.append(False) if cat_position == "end": @@ -168,11 +204,12 @@ def from_pandas( return matrices[0] -def from_polars( +def from_pandas( df, dtype: np.dtype = np.float64, sparse_threshold: float = 0.1, cat_threshold: int = 4, + object_as_cat: bool = False, cat_position: str = "expand", drop_first: bool = False, categorical_format: str = "{name}[{category}]", @@ -180,12 +217,12 @@ def from_polars( cat_missing_name: str = "(MISSING)", ) -> MatrixBase: """ - Transform a polars.DataFrame into an efficient SplitMatrix. + Transform a pandas.DataFrame into an efficient SplitMatrix. Parameters ---------- - df : pl.DataFrame - Polars DataFrame to convert. + df : pd.DataFrame + pandas DataFrame to convert. dtype : np.dtype, default np.float64 dtype of all sub-matrices of the resulting SplitMatrix. sparse_threshold : float, default 0.1 @@ -194,6 +231,9 @@ def from_polars( cat_threshold : int, default 4 Number of levels of a categorical column under which the column will be stored as sparse one-hot-encoded columns instead of CategoricalMatrix + object_as_cat : bool, default False + If True, DataFrame columns stored as python objects will be treated as + categorical columns. 
cat_position : str {'end'|'expand'}, default 'expand' Position of the categorical variable in the index. If "last", all the categoricals (including the ones that did not satisfy cat_threshold) @@ -216,106 +256,86 @@ def from_polars( ------- SplitMatrix """ - import polars as pl - - matrices: list[Union[DenseMatrix, SparseMatrix, CategoricalMatrix]] = [] - indices: list[list[int]] = [] - is_cat: list[bool] = [] - - dense_columns = [] # column index in original DataFrame - dense_indices = [] # index in the new SplitMatrix - sparse_columns = [] # sparse columns to join together - sparse_indices = [] # index in the new SplitMatrix - ignored_cols = [] - - mxcolidx = 0 - - for coldata in df.iter_columns(): - if isinstance(coldata.dtype, (pl.Categorical, pl.Enum)): - cat = CategoricalMatrix( - coldata, - drop_first=drop_first, - dtype=dtype, - column_name=coldata.name, - term_name=coldata.name, - column_name_format=categorical_format, - cat_missing_method=cat_missing_method, - cat_missing_name=cat_missing_name, - ) - if len(cat.categories) < cat_threshold: - ( - X_dense_F, - X_sparse, - dense_idx, - sparse_idx, - ) = _split_sparse_and_dense_parts( - sps.csc_matrix(cat.tocsr(), dtype=dtype), - threshold=sparse_threshold, - column_names=cat.get_names("column"), - term_names=cat.get_names("term"), - ) - matrices.append(X_dense_F) - is_cat.append(True) - matrices.append(X_sparse) - is_cat.append(True) - if cat_position == "expand": - indices.append(mxcolidx + dense_idx) - indices.append(mxcolidx + sparse_idx) - mxcolidx += len(dense_idx) + len(sparse_idx) - elif cat_position == "end": - indices.append(dense_idx) - indices.append(sparse_idx) - - else: - matrices.append(cat) - is_cat.append(True) - if cat_position == "expand": - indices.append(mxcolidx + np.arange(cat.shape[1])) - mxcolidx += cat.shape[1] - elif cat_position == "end": - indices.append(np.arange(cat.shape[1])) - elif coldata.dtype.is_numeric(): - if (coldata != 0).mean() <= sparse_threshold: - 
sparse_columns.append(coldata.name) - sparse_indices.append(mxcolidx) - mxcolidx += 1 - else: - dense_columns.append(coldata.name) - dense_indices.append(mxcolidx) - mxcolidx += 1 - - else: - ignored_cols.append(coldata.name) - - if len(ignored_cols) > 0: - warnings.warn( - f"Columns {ignored_cols} were ignored. Make sure they have a valid dtype." - ) - if dense_columns: - matrices.append(_dense_matrix(df, dense_columns, dtype)) - indices.append(dense_indices) - is_cat.append(False) - if sparse_columns: - matrices.append(_sparse_matrix(df, sparse_columns, dtype)) - indices.append(sparse_indices) - is_cat.append(False) + return _from_dataframe( + df, + engine="pandas", + dtype=dtype, + sparse_threshold=sparse_threshold, + cat_threshold=cat_threshold, + object_as_cat=object_as_cat, + cat_position=cat_position, + drop_first=drop_first, + categorical_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, + ) - if cat_position == "end": - indices = _reindex_cat(indices, is_cat, mxcolidx) - if len(matrices) > 1: - return SplitMatrix(matrices, indices) - elif len(matrices) == 0: - raise ValueError("DataFrame contained no valid column") - else: - return matrices[0] +def from_polars( + df, + dtype: np.dtype = np.float64, + sparse_threshold: float = 0.1, + cat_threshold: int = 4, + object_as_cat: bool = False, + cat_position: str = "expand", + drop_first: bool = False, + categorical_format: str = "{name}[{category}]", + cat_missing_method: str = "fail", + cat_missing_name: str = "(MISSING)", +) -> MatrixBase: + """ + Transform a polars.DataFrame into an efficient SplitMatrix. + Parameters + ---------- + df : pl.DataFrame + Polars DataFrame to convert. + dtype : np.dtype, default np.float64 + dtype of all sub-matrices of the resulting SplitMatrix. + sparse_threshold : float, default 0.1 + Density threshold below which numerical columns will be stored in a sparse + format. 
+ cat_threshold : int, default 4 + Number of levels of a categorical column under which the column will be stored + as sparse one-hot-encoded columns instead of CategoricalMatrix. + object_as_cat : bool, default False + If True, DataFrame columns stored as ``pl.String`` will be treated as + categorical columns. Note that this is different from pandas, where all object + columns are converted to categorical columns. + cat_position : str {'end'|'expand'}, default 'expand' + Position of the categorical variable in the index. If "end", all the + categoricals (including the ones that did not satisfy cat_threshold) + will be placed at the end of the index list. If "expand", all the variables + will remain in the same order. + drop_first : bool, default False + If True, categorical variables will have their first category dropped. + This allows multiple categorical variables to be included in an + unregularized model. If False, all categories are included. + cat_missing_method: str {'fail'|'zero'|'convert'}, default 'fail' + How to handle missing values in categorical columns: + - if 'fail', raise an error if there are missing values. + - if 'zero', missing values will represent all-zero indicator columns. + - if 'convert', missing values will be converted to the '(MISSING)' category. + cat_missing_name: str, default '(MISSING)' + Name of the category to which missing values will be converted if + ``cat_missing_method='convert'``. 
-def _dense_matrix(df, dense_columns, dtype): - return DenseMatrix( - df[dense_columns].to_numpy().astype(dtype), - column_names=dense_columns, - term_names=dense_columns, + Returns + ------- + SplitMatrix + """ + return _from_dataframe( + df, + engine="polars", + dtype=dtype, + sparse_threshold=sparse_threshold, + cat_threshold=cat_threshold, + object_as_cat=object_as_cat, + cat_position=cat_position, + drop_first=drop_first, + categorical_format=categorical_format, + cat_missing_method=cat_missing_method, + cat_missing_name=cat_missing_name, ) @@ -330,15 +350,6 @@ def _reindex_cat(indices, is_cat, mxcolidx): return new_indices -def _sparse_matrix(df, sparse_columns, dtype): - return SparseMatrix( - sps.coo_matrix(df[sparse_columns], dtype=dtype), - dtype=dtype, - column_names=sparse_columns, - term_names=sparse_columns, - ) - - def from_csc(mat: sps.csc_matrix, threshold=0.1, column_names=None, term_names=None): """ Convert a CSC-format sparse matrix into a ``SplitMatrix``.