Skip to content

Commit

Permalink
Merge pull request #961 from hed-standard/develop
Browse files Browse the repository at this point in the history
Merge NA
  • Loading branch information
VisLab authored Jun 14, 2024
2 parents d5deaf8 + 11de94d commit a739852
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 6 deletions.
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/factor_hed_tags_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@


import pandas as pd
import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.models.tabular_input import TabularInput
from hed.models.sidecar import Sidecar
from hed.models import query_service
from hed.tools.analysis.event_manager import EventManager
from hed.tools.analysis.hed_tag_manager import HedTagManager
from hed.tools.util.data_util import replace_na


class FactorHedTagsOp(BaseOp):
Expand Down Expand Up @@ -126,7 +126,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
if len(df_factors.columns) > 0:
df_list.append(df_factors)
df_new = pd.concat(df_list, axis=1)
df_new.replace('n/a', np.nan, inplace=True)
replace_na(df_new)
return df_new

@staticmethod
Expand Down
4 changes: 2 additions & 2 deletions hed/tools/remodeling/operations/factor_hed_type_op.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
""" Append to columnar file the factors computed from type variables. """

import pandas as pd
import numpy as np
from hed.tools.remodeling.operations.base_op import BaseOp
from hed.models.tabular_input import TabularInput
from hed.tools.analysis.event_manager import EventManager
from hed.tools.analysis.hed_type_manager import HedTypeManager
from hed.tools.util.data_util import replace_na


class FactorHedTypeOp(BaseOp):
Expand Down Expand Up @@ -82,7 +82,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
if len(df_factors.columns) > 0:
df_list.append(df_factors)
df_new = pd.concat(df_list, axis=1)
df_new.replace('n/a', np.nan, inplace=True)
replace_na(df_new)
return df_new

@staticmethod
Expand Down
9 changes: 9 additions & 0 deletions hed/tools/util/data_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,15 @@ def make_info_dataframe(col_info, selected_col):
df = pd.DataFrame(sorted(list(col_values)), columns=[selected_col])
return df

def replace_na(df):
""" Replace (in place) the n/a with np.nan taking care of categorical columns. """
for column in df.columns:
if df[column].dtype.name != 'category':
df[column] = df[column].replace('n/a', np.nan)
elif 'n/a' in df[column].cat.categories:
df[column] = df[column].astype('object')
df[column] = df[column].replace('n/a', np.nan)
df[column] = pd.Categorical(df[column])

def replace_values(df, values=None, replace_value='n/a', column_list=None):
""" Replace string values in specified columns.
Expand Down
42 changes: 40 additions & 2 deletions tests/tools/util/test_data_util.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import unittest
import numpy as np
from pandas import DataFrame

from pandas import DataFrame, Categorical
from hed.errors.exceptions import HedFileError
from hed.tools.util.data_util import add_columns, check_match, delete_columns, delete_rows_by_column, \
get_key_hash, get_new_dataframe, get_row_hash, get_value_dict, \
make_info_dataframe, reorder_columns, replace_values, separate_values
make_info_dataframe, reorder_columns, replace_na, replace_values, separate_values


class Test(unittest.TestCase):
Expand Down Expand Up @@ -107,6 +108,43 @@ def test_make_info_dataframe(self):
df2 = make_info_dataframe(col_dict, "Baloney")
self.assertFalse(df2, "make_frame should return None if column name invalid")

def test_replace_na(self):
    """ Check that replace_na turns only the 'n/a' entries into missing values. """
    # Categorical column containing n/a: only the n/a positions become missing and the
    # column stays categorical.
    df = DataFrame({
        'A': Categorical(['apple', 'n/a', 'cherry']),
        'B': ['n/a', 'pear', 'banana']
    })
    replace_na(df)
    self.assertEqual(df['A'].isnull().tolist(), [False, True, False])
    self.assertEqual(df['B'].isnull().tolist(), [True, False, False])
    self.assertEqual(df['A'].dtype.name, 'category')
    self.assertEqual(df['A'].dropna().tolist(), ['apple', 'cherry'])
    self.assertEqual(df['B'].dropna().tolist(), ['pear', 'banana'])

    # Categorical column not containing n/a: nothing is changed.
    df = DataFrame({
        'A': Categorical(['apple', 'orange', 'cherry']),
        'B': ['pear', 'melon', 'banana']
    })
    replace_na(df)
    self.assertFalse(df['A'].isnull().any())
    self.assertFalse(df['B'].isnull().any())
    self.assertEqual(df['A'].tolist(), ['apple', 'orange', 'cherry'])
    self.assertEqual(df['B'].tolist(), ['pear', 'melon', 'banana'])

    # Columns without any n/a values (here numeric) are preserved alongside replaced ones.
    df = DataFrame({
        'A': Categorical(['apple', 'n/a', 'cherry']),
        'B': ['n/a', 'pear', 'banana'],
        'C': [1, 2, 3]
    })
    replace_na(df)
    self.assertEqual(list(df['C']), [1, 2, 3])
    self.assertEqual(df['A'].isnull().tolist(), [False, True, False])
    self.assertEqual(df['B'].isnull().tolist(), [True, False, False])

    # Non-categorical n/a replacement.
    df = DataFrame({
        'A': ['apple', 'n/a', 'cherry'],
        'B': ['n/a', 'pear', 'banana']
    })
    replace_na(df)
    self.assertEqual(df['A'].isnull().tolist(), [False, True, False])
    self.assertEqual(df['B'].isnull().tolist(), [True, False, False])
    self.assertEqual(df['A'].dropna().tolist(), ['apple', 'cherry'])
    self.assertEqual(df['B'].dropna().tolist(), ['pear', 'banana'])

def test_replace_values(self):
data = {'Name': ['n/a', '', 'tom', 'alice', 0, 1],
'Age': [np.nan, 10, '', 'n/a', '0', '10']}
Expand Down

0 comments on commit a739852

Please sign in to comment.