From c59bbe57b3eca4decb03636e9d8bf11773d5e1c4 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 17 Sep 2024 21:00:03 +0000
Subject: [PATCH 1/6] test: fix test_df_apply_axis_1_complex by converting
 numpy value

---
 tests/system/large/test_remote_function.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index e224f65a01..dd4a84478f 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -1726,12 +1726,18 @@ def test_df_apply_axis_1_complex(session, pd_df):
 
         def serialize_row(row):
             custom = {
-                "name": row.name.item() if hasattr(row.name, "item") else row.name,
-                "index": [
-                    idx.item() if hasattr(idx, "item") else idx for idx in row.index
-                ],
+                "name": tuple(
+                    [
+                        # Use Python value rather than Numpy value to serialization.
+                        name.item() if hasattr(name, "item") else name
+                        for name in list(row.name)
+                    ]
+                ),
+                "index": [idx for idx in row.index],
                 "values": [
-                    val.item() if hasattr(val, "item") else val for val in row.values
+                    # Use Python value rather than Numpy value to serialization.
+                    val.item() if hasattr(val, "item") else val
+                    for val in row.values
                 ],
             }
             return str(
@@ -1744,6 +1750,8 @@ def serialize_row(row):
                 }
             )
 
+        pd_result = pd_df.apply(serialize_row, axis=1)
+
         serialize_row_remote = session.remote_function(
             bigframes.series.Series, str, reuse=False
         )(serialize_row)

From 0b1d0ffa91a9064e9f2797c5962fe66e93511ad4 Mon Sep 17 00:00:00 2001
From: Chelsea Lin <chelsealin@google.com>
Date: Tue, 24 Sep 2024 00:18:41 +0000
Subject: [PATCH 2/6] undo all changes

---
 tests/system/large/test_remote_function.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index dd4a84478f..77ea4627ec 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -1726,18 +1726,10 @@ def test_df_apply_axis_1_complex(session, pd_df):
 
         def serialize_row(row):
             custom = {
-                "name": tuple(
-                    [
-                        # Use Python value rather than Numpy value to serialization.
-                        name.item() if hasattr(name, "item") else name
-                        for name in list(row.name)
-                    ]
-                ),
+                "name": row.name,
                 "index": [idx for idx in row.index],
                 "values": [
-                    # Use Python value rather than Numpy value to serialization.
-                    val.item() if hasattr(val, "item") else val
-                    for val in row.values
+                    val.item() if hasattr(val, "item") else val for val in row.values
                 ],
             }
             return str(
@@ -1750,8 +1742,6 @@ def serialize_row(row):
                 }
             )
 
-        pd_result = pd_df.apply(serialize_row, axis=1)
-
         serialize_row_remote = session.remote_function(
             bigframes.series.Series, str, reuse=False
         )(serialize_row)

From f549683a2fe98b825f56fca81e9dba4523567bc0 Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Wed, 25 Sep 2024 07:53:17 +0000
Subject: [PATCH 3/6] improve numpy value handling in gcf code

---
 bigframes/functions/remote_function_template.py | 13 ++++++++-----
 tests/system/large/test_remote_function.py      |  7 +------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py
index c666f41daa..7aa98e7d1d 100644
--- a/bigframes/functions/remote_function_template.py
+++ b/bigframes/functions/remote_function_template.py
@@ -111,19 +111,22 @@ def convert_value(value, value_type):
             raise ValueError(f"Don't know how to handle type '{value_type}'")
         if value is None:
             return None
-        return value_converter(value)
+        py_value = value_converter(value)
+        pd_value = pd.Series([py_value], dtype=value_type)[0]
+
+        # if it is a numpy object then return the underlying value
+        pd_value = pd_value.item() if hasattr(pd_value, "item") else pd_value
+        return pd_value
 
     index_values = [
-        pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0]
-        for i in range(index_length)
+        convert_value(col_values[i], col_types[i]) for i in range(index_length)
     ]
 
     data_col_names = col_names[index_length:]
     data_col_types = col_types[index_length:]
     data_col_values = col_values[index_length:]
     data_col_values = [
-        pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0]
-        for i, a in enumerate(data_col_values)
+        convert_value(a, data_col_types[i]) for i, a in enumerate(data_col_values)
     ]
 
     row_index = index_values[0] if len(index_values) == 1 else tuple(index_values)
diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index 77ea4627ec..27daaabdf6 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -1751,12 +1751,7 @@ def serialize_row(row):
         bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas()
         pd_result = pd_df.apply(serialize_row, axis=1)
 
-        # bf_result.dtype is 'string[pyarrow]' while pd_result.dtype is 'object'
-        # , ignore this mismatch by using check_dtype=False.
-        #
-        # bf_result.index[0].dtype is 'string[pyarrow]' while
-        # pd_result.index[0].dtype is 'object', ignore this mismatch by using
-        # check_index_type=False.
+        # Ignore known dtype disparities between pandas and bigframes.
         pandas.testing.assert_series_equal(
             pd_result, bf_result, check_dtype=False, check_index_type=False
         )

From 0eb09143af64bdb56bd26cfcf81e1833ccd88fb5 Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Wed, 25 Sep 2024 07:56:24 +0000
Subject: [PATCH 4/6] enable the multiindex axis=1 test back

---
 tests/system/large/test_remote_function.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index 4e309ffcb7..8b82a3995d 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -1690,9 +1690,6 @@ def analyze(row):
                 ),
             ),
             id="multiindex",
-            marks=pytest.mark.skip(
-                reason="TODO(b/368639580) revert this skip after fix"
-            ),
         ),
         pytest.param(
             pandas.DataFrame(

From 8fae9763c52a8d00b45c1ee6cae1eeb4fffb9b21 Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Wed, 25 Sep 2024 07:59:58 +0000
Subject: [PATCH 5/6] nit reword comment

---
 tests/system/large/test_remote_function.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index 8b82a3995d..7bcfe97c48 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -1751,7 +1751,7 @@ def serialize_row(row):
         bf_result = bf_df.apply(serialize_row_remote, axis=1).to_pandas()
         pd_result = pd_df.apply(serialize_row, axis=1)
 
-        # Ignore known dtype disparities between pandas and bigframes.
+        # ignore known dtype difference between pandas and bigframes
         pandas.testing.assert_series_equal(
             pd_result, bf_result, check_dtype=False, check_index_type=False
         )

From c6ed768451de2d8d39738ac40a0dceecc035ef54 Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Fri, 27 Sep 2024 19:26:24 +0000
Subject: [PATCH 6/6] Revert "improve numpy value handling in gcf code"

This reverts commit f549683a2fe98b825f56fca81e9dba4523567bc0.
---
 .../functions/remote_function_template.py     | 13 ++++----
 tests/system/large/test_remote_function.py    | 30 ++++++++++++++-----
 2 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/bigframes/functions/remote_function_template.py b/bigframes/functions/remote_function_template.py
index 7aa98e7d1d..c666f41daa 100644
--- a/bigframes/functions/remote_function_template.py
+++ b/bigframes/functions/remote_function_template.py
@@ -111,22 +111,19 @@ def convert_value(value, value_type):
             raise ValueError(f"Don't know how to handle type '{value_type}'")
         if value is None:
             return None
-        py_value = value_converter(value)
-        pd_value = pd.Series([py_value], dtype=value_type)[0]
-
-        # if it is a numpy object then return the underlying value
-        pd_value = pd_value.item() if hasattr(pd_value, "item") else pd_value
-        return pd_value
+        return value_converter(value)
 
     index_values = [
-        convert_value(col_values[i], col_types[i]) for i in range(index_length)
+        pd.Series([convert_value(col_values[i], col_types[i])], dtype=col_types[i])[0]
+        for i in range(index_length)
     ]
 
     data_col_names = col_names[index_length:]
     data_col_types = col_types[index_length:]
     data_col_values = col_values[index_length:]
     data_col_values = [
-        convert_value(a, data_col_types[i]) for i, a in enumerate(data_col_values)
+        pd.Series([convert_value(a, data_col_types[i])], dtype=data_col_types[i])[0]
+        for i, a in enumerate(data_col_values)
     ]
 
     row_index = index_values[0] if len(index_values) == 1 else tuple(index_values)
diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py
index 7bcfe97c48..2365002857 100644
--- a/tests/system/large/test_remote_function.py
+++ b/tests/system/large/test_remote_function.py
@@ -1670,7 +1670,11 @@ def analyze(row):
                     (3, 4): ["pq", "rs", "tu"],
                     (5.0, "six", 7): [8, 9, 10],
                     'raise Exception("hacked!")': [11, 12, 13],
-                }
+                },
+                # Default pandas index has non-numpy type, whereas bigframes is
+                # always numpy-based type, so let's use the index compatible
+                # with bigframes. See more details in b/369689696.
+                index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()),
             ),
             id="all-kinds-of-column-names",
         ),
@@ -1681,15 +1685,23 @@ def analyze(row):
                     "y": [1.5, 3.75, 5],
                     "z": ["pq", "rs", "tu"],
                 },
-                index=pandas.MultiIndex.from_tuples(
-                    [
-                        ("a", 100),
-                        ("a", 200),
-                        ("b", 300),
-                    ]
+                index=pandas.MultiIndex.from_frame(
+                    pandas.DataFrame(
+                        {
+                            "idx0": pandas.Series(
+                                ["a", "a", "b"], dtype=pandas.StringDtype()
+                            ),
+                            "idx1": pandas.Series(
+                                [100, 200, 300], dtype=pandas.Int64Dtype()
+                            ),
+                        }
+                    )
                 ),
             ),
             id="multiindex",
+            marks=pytest.mark.skip(
+                reason="TODO: revert this skip after this pandas bug is fixed: https://github.com/pandas-dev/pandas/issues/59908"
+            ),
         ),
         pytest.param(
             pandas.DataFrame(
@@ -1698,6 +1710,10 @@ def analyze(row):
                     [20, 3.75, "rs"],
                     [30, 8.0, "tu"],
                 ],
+                # Default pandas index has non-numpy type, whereas bigframes is
+                # always numpy-based type, so let's use the index compatible
+                # with bigframes. See more details in b/369689696.
+                index=pandas.Index([0, 1, 2], dtype=pandas.Int64Dtype()),
                 columns=pandas.MultiIndex.from_arrays(
                     [
                         ["first", "last_two", "last_two"],