From c59bbe57b3eca4decb03636e9d8bf11773d5e1c4 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 17 Sep 2024 21:00:03 +0000 Subject: [PATCH 1/6] test: fix test_df_apply_axis_1_complex by converting numpy value --- tests/system/large/test_remote_function.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index e224f65a01..dd4a84478f 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1726,12 +1726,18 @@ def test_df_apply_axis_1_complex(session, pd_df): def serialize_row(row): custom = { - "name": row.name.item() if hasattr(row.name, "item") else row.name, - "index": [ - idx.item() if hasattr(idx, "item") else idx for idx in row.index - ], + "name": tuple( + [ + # Use Python value rather than Numpy value to serialization. + name.item() if hasattr(name, "item") else name + for name in list(row.name) + ] + ), + "index": [idx for idx in row.index], "values": [ - val.item() if hasattr(val, "item") else val for val in row.values + # Use Python value rather than Numpy value to serialization. + val.item() if hasattr(val, "item") else val + for val in row.values ], } return str( @@ -1744,6 +1750,8 @@ def serialize_row(row): } ) + pd_result = pd_df.apply(serialize_row, axis=1) + serialize_row_remote = session.remote_function( bigframes.series.Series, str, reuse=False )(serialize_row) From 0b1d0ffa91a9064e9f2797c5962fe66e93511ad4 Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Tue, 24 Sep 2024 00:18:41 +0000 Subject: [PATCH 2/6] undo all changes --- tests/system/large/test_remote_function.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index dd4a84478f..77ea4627ec 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1726,18 +1726,10 @@ def test_df_apply_axis_1_complex(session, pd_df): def serialize_row(row): custom = { - "name": tuple( - [ - # Use Python value rather than Numpy value to serialization. - name.item() if hasattr(name, "item") else name - for name in list(row.name) - ] - ), + "name": row.name, "index": [idx for idx in row.index], "values": [ - # Use Python value rather than Numpy value to serialization. - val.item() if hasattr(val, "item") else val - for val in row.values + val.item() if hasattr(val, "item") else val for val in row.values ], } return str( @@ -1750,8 +1742,6 @@ def serialize_row(row): } ) - pd_result = pd_df.apply(serialize_row, axis=1) - serialize_row_remote = session.remote_function( bigframes.series.Series, str, reuse=False )(serialize_row) From f549683a2fe98b825f56fca81e9dba4523567bc0 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 25 Sep 2024 07:53:17 +0000 Subject: [PATCH 3/6] improve numpy value handling in gcf code --- bigframes/functions/remote_function_template.py | 13 ++++++++----- tests/system/large/test_remote_function.py | 7 +------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py index c666f41daa..7aa98e7d1d 100644 --- a/bigframes/functions/remote_function_template.py +++ b/bigframes/functions/remote_function_template.py @@ -111,19 +111,22 @@ def convert_value(value, value_type): raise ValueError(f"Don't know how to handle type '{value_type}'") if value is None: return None - return value_converter(value) + py_value = value_converter(value) + pd_value = pd.Series([py_value], dtype=value_type)[0] + + # if it is a numpy object then return the underlying value + pd_value = pd_value.item() if hasattr(pd_value, "item") else pd_value + return pd_value index_values = [ - pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0] - for i in range(index_length) + convert_value(col_values[i], col_types[i]) for i in range(index_length) ] data_col_names = col_names[index_length:] data_col_types = col_types[index_length:] data_col_values = col_values[index_length:] data_col_values = [ - pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0] - for i, a in enumerate(data_col_values) + convert_value(a, data_col_types[i]) for i, a in enumerate(data_col_values) ] row_index = index_values[0] if len(index_values) == 1 else tuple(index_values) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 77ea4627ec..27daaabdf6 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1751,12 +1751,7 @@ def serialize_row(row): bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas() pd_result = pd_df.apply(serialize_row, axis=1) - # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object' - # , ignore this mismatch by using check_dtype=False. - # - # bf_result.index[0].dtype is 'string[pyarrow]' while - # pd_result.index[0].dtype is 'object', ignore this mismatch by using - # check_index_type=False. + # Ignore known dtype disparities between pandas and bigframes. pandas.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) From 0eb09143af64bdb56bd26cfcf81e1833ccd88fb5 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 25 Sep 2024 07:56:24 +0000 Subject: [PATCH 4/6] enable the multiindex axis=1 test back --- tests/system/large/test_remote_function.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 4e309ffcb7..8b82a3995d 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1690,9 +1690,6 @@ def analyze(row): ), ), id="multiindex", - marks=pytest.mark.skip( - reason="TODO(b/368639580) revert this skip after fix" - ), ), pytest.param( pandas.DataFrame( From 8fae9763c52a8d00b45c1ee6cae1eeb4fffb9b21 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Wed, 25 Sep 2024 07:59:58 +0000 Subject: [PATCH 5/6] nit reword comment --- tests/system/large/test_remote_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 8b82a3995d..7bcfe97c48 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1751,7 +1751,7 @@ def serialize_row(row): bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas() pd_result = pd_df.apply(serialize_row, axis=1) - # Ignore known dtype disparities between pandas and bigframes. + # ignore known dtype difference between pandas and bigframes pandas.testing.assert_series_equal( pd_result, bf_result, check_dtype=False, check_index_type=False ) From c6ed768451de2d8d39738ac40a0dceecc035ef54 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 27 Sep 2024 19:26:24 +0000 Subject: [PATCH 6/6] Revert "improve numpy value handling in gcf code" This reverts commit f549683a2fe98b825f56fca81e9dba4523567bc0. --- .../functions/remote_function_template.py | 13 ++++---- tests/system/large/test_remote_function.py | 30 ++++++++++++++----- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py index 7aa98e7d1d..c666f41daa 100644 --- a/bigframes/functions/remote_function_template.py +++ b/bigframes/functions/remote_function_template.py @@ -111,22 +111,19 @@ def convert_value(value, value_type): raise ValueError(f"Don't know how to handle type '{value_type}'") if value is None: return None - py_value = value_converter(value) - pd_value = pd.Series([py_value], dtype=value_type)[0] - - # if it is a numpy object then return the underlying value - pd_value = pd_value.item() if hasattr(pd_value, "item") else pd_value - return pd_value + return value_converter(value) index_values = [ - convert_value(col_values[i], col_types[i]) for i in range(index_length) + pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0] + for i in range(index_length) ] data_col_names = col_names[index_length:] data_col_types = col_types[index_length:] data_col_values = col_values[index_length:] data_col_values = [ - convert_value(a, data_col_types[i]) for i, a in enumerate(data_col_values) + pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0] + for i, a in enumerate(data_col_values) ] row_index = index_values[0] if len(index_values) == 1 else tuple(index_values) diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index 7bcfe97c48..2365002857 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1670,7 +1670,11 @@ def analyze(row): (3, 4): ["pq", "rs", "tu"], (5.0, "six", 7): [8, 9, 10], 'raise Exception("hacked!")': [11, 12, 13], - } + }, + # Default pandas index has non-numpy type, whereas bigframes is + # always numpy-based type, so let's use the index compatible + # with bigframes. See more details in b/369689696. + index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()), ), id="all-kinds-of-column-names", ), @@ -1681,15 +1685,23 @@ def analyze(row): "y": [1.5, 3.75, 5], "z": ["pq", "rs", "tu"], }, - index=pandas.MultiIndex.from_tuples( - [ - ("a", 100), - ("a", 200), - ("b", 300), - ] + index=pandas.MultiIndex.from_frame( + pandas.DataFrame( + { + "idx0": pandas.Series( + ["a", "a", "b"], dtype=pandas.StringDtype() + ), + "idx1": pandas.Series( + [100, 200, 300], dtype=pandas.Int64Dtype() + ), + } + ) ), ), id="multiindex", + marks=pytest.mark.skip( + reason="TODO: revert this skip after this pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/59908" + ), ), pytest.param( pandas.DataFrame( @@ -1698,6 +1710,10 @@ def analyze(row): [20, 3.75, "rs"], [30, 8.0, "tu"], ], + # Default pandas index has non-numpy type, whereas bigframes is + # always numpy-based type, so let's use the index compatible + # with bigframes. See more details in b/369689696. + index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()), columns=pandas.MultiIndex.from_arrays( [ ["first", "last_two", "last_two"],