From e7b76836bfd06b66c24afc4d48ca30bcb0e9dd8c Mon Sep 17 00:00:00 2001
From: Tom Pollard <tpollard@mit.edu>
Date: Thu, 13 Jun 2024 01:21:22 -0400
Subject: [PATCH 1/3] Raise error if categorical columns include None/null
 values.

---
 tableone/tableone.py   |  2 +-
 tableone/validators.py | 19 ++++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tableone/tableone.py b/tableone/tableone.py
index 7472137..2bf43fc 100644
--- a/tableone/tableone.py
+++ b/tableone/tableone.py
@@ -329,10 +329,10 @@ def setup_validators(self):
         self.input_validator = InputValidator()
 
     def validate_data(self, data):
-        self.data_validator.validate(data, self._columns)  # type: ignore
         self.input_validator.validate(self._groupby, self._nonnormal, self._min_max,  # type: ignore
                                       self._pval_adjust, self._order, self._pval,  # type: ignore
                                       self._columns, self._categorical, self._continuous)  # type: ignore
+        self.data_validator.validate(data, self._columns, self._categorical)  # type: ignore
 
     def create_intermediate_tables(self, data):
         """
diff --git a/tableone/validators.py b/tableone/validators.py
index a70dc82..bc69fa8 100644
--- a/tableone/validators.py
+++ b/tableone/validators.py
@@ -10,7 +10,8 @@ def __init__(self):
         """Initialize the DataValidator class."""
         pass
 
-    def validate(self, data: pd.DataFrame, columns: list) -> None:
+    def validate(self, data: pd.DataFrame, columns: list,
+                 categorical: Optional[List[str]] = None) -> None:
         """
         Check the input dataset for obvious issues.
 
@@ -22,6 +23,22 @@ def validate(self, data: pd.DataFrame, columns: list) -> None:
         self.check_unique_index(data)
         self.check_columns_exist(data, columns)
         self.check_duplicate_columns(data, columns)
+        if categorical:
+            self.check_categorical_none(data, categorical)
+
+    def check_categorical_none(self, data: pd.DataFrame, categorical: List[str]):
+        """
+        Ensure that categorical columns do not contain None values.
+
+        Parameters:
+        data (pd.DataFrame): The DataFrame to check.
+        categorical (List[str]): The list of categorical columns to validate.
+        """
+        none_containing_cols = [col for col in categorical if data[col].isnull().any()]
+        if none_containing_cols:
+            raise InputError(f"The following categorical columns contains one or more 'None' values. These values "
+                             f"must be converted to a string before processing: {none_containing_cols}. e.g. use "
+                             f"data[categorical_columns] = data[categorical_columns].fillna('None')")
 
     def validate_input(self, data: pd.DataFrame):
         if not isinstance(data, pd.DataFrame):

From 44ef61fa6321b5680c9d33a2a269748c66eebe17 Mon Sep 17 00:00:00 2001
From: Tom Pollard <tpollard@mit.edu>
Date: Thu, 13 Jun 2024 02:41:19 -0400
Subject: [PATCH 2/3] Add auto_fill_nulls argument. Ref #114

Missing values are now treated as a category for categorical variables.
---
 tableone/preprocessors.py | 16 ++++++++++++++++
 tableone/tableone.py      | 33 ++++++++++++++++++++++++---------
 tableone/validators.py    | 14 ++++++++------
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/tableone/preprocessors.py b/tableone/preprocessors.py
index caf7646..ed8b18f 100644
--- a/tableone/preprocessors.py
+++ b/tableone/preprocessors.py
@@ -1,4 +1,5 @@
 import numpy as np
+import pandas as pd
 
 from tableone.exceptions import InputError
 
@@ -99,3 +100,18 @@ def get_groups(data, groupby, order, reserved_columns):
         groupbylvls = ['Overall']
 
     return groupbylvls
+
+
+def handle_categorical_nulls(df: pd.DataFrame, null_value: str = 'None') -> pd.DataFrame:
+    """
+    Convert None/Null values in every column of the given DataFrame to a given string,
+    so they are treated as an additional category.
+
+    Parameters:
+    - df (pd.DataFrame): The DataFrame containing the categorical data.
+    - null_value (str): The string to replace null values with. Default is 'None'.
+
+    Returns:
+    - pd.DataFrame: A new DataFrame with null values replaced by null_value.
+    """
+    return df.fillna(null_value)
diff --git a/tableone/tableone.py b/tableone/tableone.py
index 2bf43fc..948d49a 100644
--- a/tableone/tableone.py
+++ b/tableone/tableone.py
@@ -10,7 +10,8 @@
 from tabulate import tabulate
 
 from tableone.deprecations import handle_deprecated_parameters
-from tableone.preprocessors import ensure_list, detect_categorical, order_categorical, get_groups
+from tableone.preprocessors import (ensure_list, detect_categorical, order_categorical,
+                                    get_groups, handle_categorical_nulls)
 from tableone.statistics import Statistics
 from tableone.tables import Tables
 from tableone.validators import DataValidator, InputValidator
@@ -168,6 +169,10 @@ class TableOne:
         Run Tukey's test for far outliers. If variables are found to
         have far outliers, a remark will be added below the Table 1.
         (default: False)
+    auto_fill_nulls : bool, optional
+        Attempt to automatically handle None/Null values in categorical columns
+        by treating them as a category named 'None'. (default: True)
+
 
     Attributes
     ----------
@@ -219,7 +224,8 @@ def __init__(self, data: pd.DataFrame,
                  row_percent: bool = False, display_all: bool = False,
                  dip_test: bool = False, normal_test: bool = False,
                  tukey_test: bool = False,
-                 pval_threshold: Optional[float] = None) -> None:
+                 pval_threshold: Optional[float] = None,
+                 auto_fill_nulls: Optional[bool] = True) -> None:
 
         # Warn about deprecated parameters
         handle_deprecated_parameters(labels, isnull, pval_test_name, remarks)
@@ -229,11 +235,12 @@ def __init__(self, data: pd.DataFrame,
         self.tables = Tables()
 
         # Initialize attributes
-        self.initialize_core_attributes(data, columns, categorical, continuous, groupby,
-                                        nonnormal, min_max, pval, pval_adjust, htest_name,
-                                        htest, missing, ddof, rename, sort, limit, order,
-                                        label_suffix, decimals, smd, overall, row_percent,
-                                        dip_test, normal_test, tukey_test, pval_threshold)
+        data = self.initialize_core_attributes(data, columns, categorical, continuous, groupby,
+                                               nonnormal, min_max, pval, pval_adjust, htest_name,
+                                               htest, missing, ddof, rename, sort, limit, order,
+                                               label_suffix, decimals, smd, overall, row_percent,
+                                               dip_test, normal_test, tukey_test, pval_threshold,
+                                               auto_fill_nulls)
 
         # Initialize intermediate tables
         self.initialize_intermediate_tables()
@@ -274,11 +281,13 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro
                                    nonnormal, min_max, pval, pval_adjust, htest_name,
                                    htest, missing, ddof, rename, sort, limit, order,
                                    label_suffix, decimals, smd, overall, row_percent, 
-                                   dip_test, normal_test, tukey_test, pval_threshold):
+                                   dip_test, normal_test, tukey_test, pval_threshold,
+                                   auto_fill_nulls):
         """
         Initialize attributes.
         """
         self._alt_labels = rename
+        self._auto_fill_nulls = auto_fill_nulls
         self._columns = columns if columns else data.columns.to_list()  # type: ignore
         self._categorical = detect_categorical(data[self._columns], groupby) if categorical is None else categorical
         if continuous:
@@ -308,8 +317,14 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro
         self._sort = sort
         self._tukey_test = tukey_test
         self._warnings = {}
+
+        if self._categorical and self._auto_fill_nulls:
+            data[self._categorical] = handle_categorical_nulls(data[self._categorical])
+
         self._groupbylvls = get_groups(data, self._groupby, self._order, self._reserved_columns)
 
+        return data
+
     def initialize_intermediate_tables(self):
         """
         Initialize the intermediate tables.
@@ -332,7 +347,7 @@ def validate_data(self, data):
         self.input_validator.validate(self._groupby, self._nonnormal, self._min_max,  # type: ignore
                                       self._pval_adjust, self._order, self._pval,  # type: ignore
                                       self._columns, self._categorical, self._continuous)  # type: ignore
-        self.data_validator.validate(data, self._columns, self._categorical)  # type: ignore
+        self.data_validator.validate(data, self._columns, self._categorical, self._auto_fill_nulls)  # type: ignore
 
     def create_intermediate_tables(self, data):
         """
diff --git a/tableone/validators.py b/tableone/validators.py
index bc69fa8..d738293 100644
--- a/tableone/validators.py
+++ b/tableone/validators.py
@@ -11,7 +11,8 @@ def __init__(self):
         pass
 
     def validate(self, data: pd.DataFrame, columns: list,
-                 categorical: Optional[List[str]] = None) -> None:
+                 categorical: list,
+                 auto_fill_nulls: bool) -> None:
         """
         Check the input dataset for obvious issues.
 
@@ -23,7 +24,7 @@ def validate(self, data: pd.DataFrame, columns: list,
         self.check_unique_index(data)
         self.check_columns_exist(data, columns)
         self.check_duplicate_columns(data, columns)
-        if categorical:
+        if categorical and not auto_fill_nulls:
             self.check_categorical_none(data, categorical)
 
     def check_categorical_none(self, data: pd.DataFrame, categorical: List[str]):
@@ -34,10 +35,11 @@ def check_categorical_none(self, data: pd.DataFrame, categorical: List[str]):
         data (pd.DataFrame): The DataFrame to check.
         categorical (List[str]): The list of categorical columns to validate.
         """
-        none_containing_cols = [col for col in categorical if data[col].isnull().any()]
-        if none_containing_cols:
-            raise InputError(f"The following categorical columns contains one or more 'None' values. These values "
-                             f"must be converted to a string before processing: {none_containing_cols}. e.g. use "
+        contains_none = [col for col in categorical if data[col].isnull().any()]
+        if contains_none:
+            raise InputError(f"The following categorical columns contains one or more null values: {contains_none}. "
+                             f"These must be converted to strings before processing. Either set "
+                             f"`auto_fill_nulls = True` or manually convert nulls to strings with: "
                              f"data[categorical_columns] = data[categorical_columns].fillna('None')")
 
     def validate_input(self, data: pd.DataFrame):

From 02a73260f4642ef2b1f86ce68f935a9f798abd10 Mon Sep 17 00:00:00 2001
From: Tom Pollard <tpollard@mit.edu>
Date: Fri, 14 Jun 2024 00:59:47 -0400
Subject: [PATCH 3/3] Rename auto_fill_nulls to include_null. Handle
 include_null=False and include_null=True. Ref #114.

---
 tableone/tableone.py        | 21 ++++++++++++---------
 tableone/tables.py          | 26 ++++++++++++++++++++------
 tableone/validators.py      | 19 +------------------
 tests/unit/test_tableone.py | 15 ++++++++-------
 4 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/tableone/tableone.py b/tableone/tableone.py
index 948d49a..2a1a062 100644
--- a/tableone/tableone.py
+++ b/tableone/tableone.py
@@ -169,9 +169,9 @@ class TableOne:
         Run Tukey's test for far outliers. If variables are found to
         have far outliers, a remark will be added below the Table 1.
         (default: False)
-    auto_fill_nulls : bool, optional
-        Attempt to automatically handle None/Null values in categorical columns
-        by treating them as a category named 'None'. (default: True)
+    include_null : bool, optional
+        Include None/Null values for categorical variables by treating them as a
+        category level. (default: True)
 
 
     Attributes
@@ -225,7 +225,7 @@ def __init__(self, data: pd.DataFrame,
                  dip_test: bool = False, normal_test: bool = False,
                  tukey_test: bool = False,
                  pval_threshold: Optional[float] = None,
-                 auto_fill_nulls: Optional[bool] = True) -> None:
+                 include_null: Optional[bool] = True) -> None:
 
         # Warn about deprecated parameters
         handle_deprecated_parameters(labels, isnull, pval_test_name, remarks)
@@ -240,7 +240,7 @@ def __init__(self, data: pd.DataFrame,
                                                htest, missing, ddof, rename, sort, limit, order,
                                                label_suffix, decimals, smd, overall, row_percent,
                                                dip_test, normal_test, tukey_test, pval_threshold,
-                                               auto_fill_nulls)
+                                               include_null)
 
         # Initialize intermediate tables
         self.initialize_intermediate_tables()
@@ -282,12 +282,12 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro
                                    htest, missing, ddof, rename, sort, limit, order,
                                    label_suffix, decimals, smd, overall, row_percent, 
                                    dip_test, normal_test, tukey_test, pval_threshold,
-                                   auto_fill_nulls):
+                                   include_null):
         """
         Initialize attributes.
         """
         self._alt_labels = rename
-        self._auto_fill_nulls = auto_fill_nulls
+        self._include_null = include_null
         self._columns = columns if columns else data.columns.to_list()  # type: ignore
         self._categorical = detect_categorical(data[self._columns], groupby) if categorical is None else categorical
         if continuous:
@@ -318,7 +318,7 @@ def initialize_core_attributes(self, data, columns, categorical, continuous, gro
         self._tukey_test = tukey_test
         self._warnings = {}
 
-        if self._categorical and self._auto_fill_nulls:
+        if self._categorical and self._include_null:
             data[self._categorical] = handle_categorical_nulls(data[self._categorical])
 
         self._groupbylvls = get_groups(data, self._groupby, self._order, self._reserved_columns)
@@ -347,7 +347,7 @@ def validate_data(self, data):
         self.input_validator.validate(self._groupby, self._nonnormal, self._min_max,  # type: ignore
                                       self._pval_adjust, self._order, self._pval,  # type: ignore
                                       self._columns, self._categorical, self._continuous)  # type: ignore
-        self.data_validator.validate(data, self._columns, self._categorical, self._auto_fill_nulls)  # type: ignore
+        self.data_validator.validate(data, self._columns, self._categorical, self._include_null)  # type: ignore
 
     def create_intermediate_tables(self, data):
         """
@@ -366,6 +366,7 @@ def create_intermediate_tables(self, data):
                                                                     self._categorical,
                                                                     self._decimals,
                                                                     self._row_percent,
+                                                                    self._include_null,
                                                                     groupby=None,
                                                                     groupbylvls=['Overall'])
 
@@ -385,6 +386,7 @@ def create_intermediate_tables(self, data):
                                                                 self._categorical,
                                                                 self._decimals,
                                                                 self._row_percent,
+                                                                self._include_null,
                                                                 groupby=self._groupby,
                                                                 groupbylvls=self._groupbylvls)
 
@@ -413,6 +415,7 @@ def create_intermediate_tables(self, data):
                                                           self._overall,
                                                           self.cat_describe,
                                                           self._categorical,
+                                                          self._include_null,
                                                           self._pval,
                                                           self._pval_adjust,
                                                           self.htest_table,
diff --git a/tableone/tables.py b/tableone/tables.py
index e35c743..b15ba65 100644
--- a/tableone/tables.py
+++ b/tableone/tables.py
@@ -193,6 +193,7 @@ def create_cat_describe(self,
                             categorical,
                             decimals,
                             row_percent,
+                            include_null,
                             groupby: Optional[str] = None,
                             groupbylvls: Optional[list] = None
                             ) -> pd.DataFrame:
@@ -223,12 +224,19 @@ def create_cat_describe(self,
             else:
                 df = cat_slice.copy()
 
-            # create n column and null count column
+            # create n column
             # must be done before converting values to strings
             ct = df.count().to_frame(name='n')
             ct.index.name = 'variable'
-            nulls = df.isnull().sum().to_frame(name='Missing')
-            nulls.index.name = 'variable'
+
+            if include_null:
+                # create an empty Missing column for display purposes
+                nulls = pd.DataFrame('', index=df.columns, columns=['Missing'])
+                nulls.index.name = 'variable'
+            else:
+                # Count and display null count
+                nulls = df.isnull().sum().to_frame(name='Missing')
+                nulls.index.name = 'variable'
 
             # Convert to str to handle int converted to boolean in the index.
             # Also avoid nans.
@@ -445,6 +453,7 @@ def create_cat_table(self,
                          overall,
                          cat_describe,
                          categorical,
+                         include_null,
                          pval,
                          pval_adjust,
                          htest_table,
@@ -462,9 +471,14 @@ def create_cat_table(self,
         """
         table = cat_describe['t1_summary'].copy()
 
-        # add the total count of null values across all levels
-        isnull = data[categorical].isnull().sum().to_frame(name='Missing')
-        isnull.index = isnull.index.rename('variable')
+        if include_null:
+            isnull = pd.DataFrame(index=categorical, columns=['Missing'])
+            isnull['Missing'] = ''
+            isnull.index.rename('variable', inplace=True)
+        else:
+            # add the total count of null values across all levels
+            isnull = data[categorical].isnull().sum().to_frame(name='Missing')
+            isnull.index = isnull.index.rename('variable')
 
         try:
             table = table.join(isnull)
diff --git a/tableone/validators.py b/tableone/validators.py
index d738293..879da63 100644
--- a/tableone/validators.py
+++ b/tableone/validators.py
@@ -12,7 +12,7 @@ def __init__(self):
 
     def validate(self, data: pd.DataFrame, columns: list,
                  categorical: list,
-                 auto_fill_nulls: bool) -> None:
+                 include_null: bool) -> None:
         """
         Check the input dataset for obvious issues.
 
@@ -24,23 +24,6 @@ def validate(self, data: pd.DataFrame, columns: list,
         self.check_unique_index(data)
         self.check_columns_exist(data, columns)
         self.check_duplicate_columns(data, columns)
-        if categorical and not auto_fill_nulls:
-            self.check_categorical_none(data, categorical)
-
-    def check_categorical_none(self, data: pd.DataFrame, categorical: List[str]):
-        """
-        Ensure that categorical columns do not contain None values.
-
-        Parameters:
-        data (pd.DataFrame): The DataFrame to check.
-        categorical (List[str]): The list of categorical columns to validate.
-        """
-        contains_none = [col for col in categorical if data[col].isnull().any()]
-        if contains_none:
-            raise InputError(f"The following categorical columns contains one or more null values: {contains_none}. "
-                             f"These must be converted to strings before processing. Either set "
-                             f"`auto_fill_nulls = True` or manually convert nulls to strings with: "
-                             f"data[categorical_columns] = data[categorical_columns].fillna('None')")
 
     def validate_input(self, data: pd.DataFrame):
         if not isinstance(data, pd.DataFrame):
diff --git a/tests/unit/test_tableone.py b/tests/unit/test_tableone.py
index e205528..601529b 100644
--- a/tests/unit/test_tableone.py
+++ b/tests/unit/test_tableone.py
@@ -216,7 +216,7 @@ def test_overall_n_and_percent_for_binary_cat_var_with_nan(
         """
         categorical = ['likeshoney']
         table = TableOne(data_sample, columns=categorical,
-                         categorical=categorical)
+                         categorical=categorical, include_null=False)
 
         lh = table.cat_describe.loc['likeshoney']
 
@@ -796,7 +796,8 @@ def test_nan_rows_not_deleted_in_categorical_columns(self):
 
         # create tableone
         t1 = TableOne(df, label_suffix=False,
-                      categorical=['basket1', 'basket2', 'basket3', 'basket4'])
+                      categorical=['basket1', 'basket2', 'basket3', 'basket4'],
+                      include_null=False)
 
         assert all(t1.tableone.loc['basket1'].index == ['apple', 'banana',
                                                         'durian', 'lemon',
@@ -1028,7 +1029,7 @@ def test_order_of_order_categorical_columns(self):
 
         # if a custom order is not specified, the categorical order
         # specified above should apply
-        t1 = TableOne(data, label_suffix=False)
+        t1 = TableOne(data, label_suffix=False, include_null=False)
 
         t1_expected_order = {'month': ["feb", "jan", "mar", "apr"],
                              'day': ["wed", "thu", "mon", "tue"]}
@@ -1039,7 +1040,7 @@ def test_order_of_order_categorical_columns(self):
                     t1_expected_order[k])
 
         # if a desired order is set, it should override the order
-        t2 = TableOne(data, order=order, label_suffix=False)
+        t2 = TableOne(data, order=order, label_suffix=False, include_null=False)
 
         t2_expected_order = {'month': ["jan", "feb", "mar", "apr"],
                              'day': ["mon", "tue", "wed", "thu"]}
@@ -1104,7 +1105,7 @@ def test_row_percent_false(self, data_pn):
         t1 = TableOne(data_pn, columns=columns,
                       categorical=categorical, groupby=groupby,
                       nonnormal=nonnormal, decimals=decimals,
-                      row_percent=False)
+                      row_percent=False, include_null=False)
 
         row1 = list(t1.tableone.loc["MechVent, n (%)"][group].values[0])
         row1_expect = [0, '540 (54.0)', '468 (54.2)', '72 (52.9)']
@@ -1154,7 +1155,7 @@ def test_row_percent_true(self, data_pn):
         t2 = TableOne(data_pn, columns=columns,
                       categorical=categorical, groupby=groupby,
                       nonnormal=nonnormal, decimals=decimals,
-                      row_percent=True)
+                      row_percent=True, include_null=False)
 
         row1 = list(t2.tableone.loc["MechVent, n (%)"][group].values[0])
         row1_expect = [0, '540 (100.0)', '468 (86.7)', '72 (13.3)']
@@ -1204,7 +1205,7 @@ def test_row_percent_true_and_overall_false(self, data_pn):
         t1 = TableOne(data_pn, columns=columns, overall=False,
                       categorical=categorical, groupby=groupby,
                       nonnormal=nonnormal, decimals=decimals,
-                      row_percent=True)
+                      row_percent=True, include_null=False)
 
         row1 = list(t1.tableone.loc["MechVent, n (%)"][group].values[0])
         row1_expect = [0, '468 (86.7)', '72 (13.3)']