ENH: Fix some warnings caused by deprecation #804

Merged
merged 17 commits on Sep 18, 2024
2 changes: 1 addition & 1 deletion .github/workflows/asv.yaml
@@ -54,7 +54,7 @@ jobs:
if: ${{ steps.build.outcome == 'success' }}

- name: Publish benchmarks artifact
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: Benchmarks log
path: benchmarks/asv_bench/results
5 changes: 3 additions & 2 deletions .github/workflows/python.yaml
@@ -92,9 +92,10 @@ jobs:
- { os: ubuntu-20.04, module: hadoop, python-version: 3.9 }
- { os: ubuntu-latest, module: vineyard, python-version: 3.9 }
- { os: ubuntu-latest, module: external-storage, python-version: 3.9 }
- { os: ubuntu-latest, module: compatibility, python-version: 3.9 }
# always test compatibility with the latest version
# - { os: ubuntu-latest, module: compatibility, python-version: 3.9 }
- { os: ubuntu-latest, module: doc-build, python-version: 3.9 }
- { os: [self-hosted, gpu], module: gpu, python-version: 3.11}
- { os: self-hosted, module: gpu, python-version: 3.11}
- { os: ubuntu-latest, module: jax, python-version: 3.9 }
# a self-hosted runner which needs computing resources, activate when necessary
# - { os: juicefs-ci, module: kubernetes-juicefs, python-version: 3.9 }
2 changes: 1 addition & 1 deletion python/xorbits/_mars/_utils.pyx
@@ -207,7 +207,7 @@ cdef list tokenize_pandas_dataframe(ob):


cdef list tokenize_pandas_categorical(ob):
l = ob.to_list()
l = ob.tolist()
l.append(ob.shape)
return iterative_tokenize(l)

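A note on the change above: Categorical.to_list() is deprecated in recent pandas (2.1+) in favor of tolist(). A minimal plain-Python sketch of the same input handling (not the Cython helper itself):

import pandas as pd

cat = pd.Categorical(["a", "b", "a"])

# Categorical.to_list() is deprecated; tolist() is the supported spelling
values = cat.tolist()      # ['a', 'b', 'a']
values.append(cat.shape)   # the helper above also appends the shape
print(values)              # ['a', 'b', 'a', (3,)]
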
11 changes: 3 additions & 8 deletions python/xorbits/_mars/dataframe/base/accessor.py
@@ -17,12 +17,7 @@
from typing import Iterable

import pandas as pd
from pandas.api.types import (
is_datetime64_dtype,
is_datetime64tz_dtype,
is_period_dtype,
is_timedelta64_dtype,
)
from pandas.api.types import is_datetime64_dtype, is_timedelta64_dtype

from ...utils import adapt_mars_docstring
from .datetimes import SeriesDatetimeMethod, _datetime_method_to_handlers
@@ -238,9 +233,9 @@ class DatetimeAccessor:
def __init__(self, series):
if (
not is_datetime64_dtype(series.dtype)
and not is_datetime64tz_dtype(series.dtype)
and not isinstance(series.dtype, pd.DatetimeTZDtype)
and not is_timedelta64_dtype(series.dtype)
and not is_period_dtype(series.dtype)
and not isinstance(series.dtype, pd.PeriodDtype)
):
raise AttributeError("Can only use .dt accessor with datetimelike values")
self._series = series
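A note on the change above: is_datetime64tz_dtype and is_period_dtype are deprecated in pandas 2.x; isinstance checks against the dtype classes are the recommended replacement. A minimal sketch of the equivalent check on plain pandas:

import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_timedelta64_dtype

s = pd.Series(pd.date_range("2024-01-01", periods=3, tz="UTC"))

# deprecated: is_datetime64tz_dtype(s.dtype), is_period_dtype(s.dtype)
# preferred: isinstance checks against pd.DatetimeTZDtype / pd.PeriodDtype
is_datetimelike = (
    is_datetime64_dtype(s.dtype)
    or isinstance(s.dtype, pd.DatetimeTZDtype)
    or is_timedelta64_dtype(s.dtype)
    or isinstance(s.dtype, pd.PeriodDtype)
)
print(is_datetimelike)  # True
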
7 changes: 4 additions & 3 deletions python/xorbits/_mars/dataframe/base/apply.py
@@ -107,9 +107,10 @@ def execute(cls, ctx, op):
**op.kwds,
)
else:
result = input_data.apply(
func, convert_dtype=op.convert_dtype, args=op.args, **op.kwds
)
if op.convert_dtype:
result = input_data.apply(func, args=op.args, **op.kwds)
else:
result = input_data.apply(func, args=op.args, **op.kwds).astype(object)
ctx[out.key] = result

@classmethod
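A note on the change above: Series.apply(..., convert_dtype=...) is deprecated since pandas 2.1, and the new branch emulates convert_dtype=False by casting the result to object. A minimal sketch on plain pandas:

import pandas as pd

s = pd.Series([1, 2, 3])

# old: s.apply(lambda x: x + 1, convert_dtype=False)  -> FutureWarning in pandas 2.1+
# new: drop the keyword and cast the result to object to keep the old behaviour
result = s.apply(lambda x: x + 1).astype(object)
print(result.dtype)  # object
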
@@ -210,7 +210,7 @@ def subtract_custom_value(x, custom_value):
).execute()
assert res.data_params["dtype"] == "object"
pd.testing.assert_series_equal(
res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=False)
res.fetch(), s.apply(apply_func, args=(5,)).astype(object)
)

res = ms.apply(
@@ -220,9 +220,7 @@ def subtract_custom_value(x, custom_value):
assert res.shape == (4,)
with pytest.raises(AttributeError):
_ = res.dtypes
pd.testing.assert_series_equal(
res.fetch(), s.apply(apply_func, args=(5,), convert_dtype=True)
)
pd.testing.assert_series_equal(res.fetch(), s.apply(apply_func, args=(5,)))


def test_apply_execution_with_multi_chunks(setup):
10 changes: 5 additions & 5 deletions python/xorbits/_mars/dataframe/base/tests/test_base_execution.py
@@ -476,7 +476,7 @@ def test_series_apply_execute(setup):

r = series.apply(lambda x: [x, x + 1], convert_dtype=False)
result = r.execute().fetch()
expected = s_raw.apply(lambda x: [x, x + 1], convert_dtype=False)
expected = s_raw.apply(lambda x: [x, x + 1]).astype(object)
pd.testing.assert_series_equal(result, expected)

s_raw2 = pd.Series([np.array([1, 2, 3]), np.array([4, 5, 6])])
@@ -502,7 +502,7 @@ def closure(z):

r = series.apply(closure, convert_dtype=False)
result = r.execute().fetch()
expected = s_raw.apply(closure, convert_dtype=False)
expected = s_raw.apply(closure).astype(object)
pd.testing.assert_series_equal(result, expected)

class callable_series:
@@ -518,7 +518,7 @@ def __call__(self, z):
cs = callable_series()
r = series.apply(cs, convert_dtype=False)
result = r.execute().fetch()
expected = s_raw.apply(cs, convert_dtype=False)
expected = s_raw.apply(cs).astype(object)
pd.testing.assert_series_equal(result, expected)


@@ -528,9 +528,9 @@ def test_apply_with_arrow_dtype_execution(setup):
df1 = table.to_pandas(types_mapper=pd.ArrowDtype)
df = from_pandas_df(df1)

r = df.apply(lambda row: str(row[0]) + row[1], axis=1)
r = df.apply(lambda row: str(row.iloc[0]) + row.iloc[1], axis=1)
result = r.execute().fetch()
expected = df1.apply(lambda row: str(row[0]) + row[1], axis=1)
expected = df1.apply(lambda row: str(row.iloc[0]) + row.iloc[1], axis=1)
pd.testing.assert_series_equal(result, expected)

s1 = df1["b"]
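A note on the row.iloc change above: in pandas 2.x, Series indexing with an integer key is label-based and the positional fallback is deprecated, so .iloc makes the positional intent explicit. A minimal sketch:

import pandas as pd

row = pd.Series([1, "a"], index=["x", "y"])

# row[0] relies on the deprecated positional fallback; .iloc is explicit
first = row.iloc[0]
second = row.iloc[1]
print(str(first) + second)  # "1a"
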
@@ -599,7 +599,7 @@ def test_date_range():
with pytest.raises(ValueError):
_ = date_range(pd.NaT, periods=10)

expected = pd.date_range("2020-1-1", periods=9.0, name="date")
expected = pd.date_range("2020-1-1", periods=9, name="date")

dr = date_range("2020-1-1", periods=9.0, name="date", chunk_size=3)
assert isinstance(dr, DatetimeIndex)
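A note on the change above: non-integer values for periods are deprecated in pd.date_range (pandas 2.2), so the expected index is built with a plain int. A minimal sketch:

import pandas as pd

# periods=9.0 triggers a deprecation warning; pass an int instead
expected = pd.date_range("2020-1-1", periods=9, name="date")
print(len(expected))  # 9
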
@@ -1281,11 +1281,11 @@ def test_date_range_execution(setup):

# start, end and freq
dr = md.date_range(
"2020-1-1", "2020-1-10", freq="12H", chunk_size=chunk_size, **kw
"2020-1-1", "2020-1-10", freq="12h", chunk_size=chunk_size, **kw
)

result = dr.execute().fetch()
expected = pd.date_range("2020-1-1", "2020-1-10", freq="12H", **kw)
expected = pd.date_range("2020-1-1", "2020-1-10", freq="12h", **kw)
pd.testing.assert_index_equal(result, expected)

# test timezone
@@ -1317,15 +1317,15 @@ def test_date_range_execution(setup):
pd.testing.assert_index_equal(result, expected)

# test freq
dr = md.date_range(start="1/1/2018", periods=5, freq="M", chunk_size=3)
dr = md.date_range(start="1/1/2018", periods=5, freq="ME", chunk_size=3)

result = dr.execute().fetch()
expected = pd.date_range(start="1/1/2018", periods=5, freq="M")
expected = pd.date_range(start="1/1/2018", periods=5, freq="ME")
pd.testing.assert_index_equal(result, expected)

dr = md.date_range(start="2018/01/01", end="2018/07/01", freq="M")
dr = md.date_range(start="2018/01/01", end="2018/07/01", freq="ME")
result = dr.execute().fetch()
expected = pd.date_range(start="2018/01/01", end="2018/07/01", freq="M")
expected = pd.date_range(start="2018/01/01", end="2018/07/01", freq="ME")
pd.testing.assert_index_equal(result, expected)


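A note on the frequency changes above: pandas 2.2 renames several offset aliases, with uppercase "H" replaced by lowercase "h" and month-end "M" replaced by "ME". A minimal sketch (assumes pandas >= 2.2 for the "ME" alias):

import pandas as pd

# "12H" -> "12h": the hour alias is now lowercase
half_days = pd.date_range("2020-1-1", "2020-1-10", freq="12h")

# "M" -> "ME": the month-end alias was renamed
month_ends = pd.date_range(start="1/1/2018", periods=5, freq="ME")
print(half_days[:2], month_ends, sep="\n")
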
2 changes: 1 addition & 1 deletion python/xorbits/_mars/dataframe/indexing/index_lib.py
@@ -815,7 +815,7 @@ def _create_reorder_chunk(
reorder_indexes[-1]
]
params["columns_value"] = parse_index(reorder_columns, store_data=True)
params["dtypes"] = concat_chunk.dtypes[reorder_indexes[-1]]
params["dtypes"] = concat_chunk.dtypes.iloc[reorder_indexes[-1]]

return reorder_chunk_op.new_chunk([concat_chunk], kws=[params])

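A note on the change above: concat_chunk.dtypes is a pandas Series keyed by column label, so positional reordering should go through .iloc; integer keys in plain [] rely on the deprecated positional fallback. A minimal sketch on a plain dtypes Series:

import pandas as pd

dtypes = pd.DataFrame({"a": [1], "b": ["x"], "c": [1.0]}).dtypes

# select by position explicitly instead of dtypes[[2, 0, 1]]
reordered = dtypes.iloc[[2, 0, 1]]
print(reordered.index.tolist())  # ['c', 'a', 'b']
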
@@ -1725,6 +1725,7 @@ def test_sample_execution(setup):

def test_loc_setitem(setup):
raw_df = pd.DataFrame({"a": [1, 2, 3, 4, 2, 4, 5, 7, 2, 8, 9], 1: [10] * 11})
raw_df = raw_df.astype("object")
md_data = md.DataFrame(raw_df, chunk_size=3)
md_data.loc[md_data["a"] <= 4, 1] = "v1"
pd_data = raw_df.copy(True)
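A note on the added astype("object") line: since pandas 2.1, assigning a value of incompatible dtype through .loc (here a string into an integer column) emits a FutureWarning instead of upcasting silently, so the test casts to object first. A minimal sketch:

import pandas as pd

raw_df = pd.DataFrame({"a": [1, 2, 3, 4], 1: [10] * 4})

# without this cast, the string assignment below warns that setting an item
# of incompatible dtype is deprecated
raw_df = raw_df.astype("object")
raw_df.loc[raw_df["a"] <= 2, 1] = "v1"
print(raw_df)
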
50 changes: 26 additions & 24 deletions python/xorbits/_mars/dataframe/missing/fillna.py
@@ -89,6 +89,18 @@ def _set_inputs(self, inputs):
def output_limit(self):
return self._output_limit or 1

@staticmethod
def _apply_fillna_with_method(df, value, method, axis, limit, inplace=False):
"""
Parameter method is deprecated since version 2.1.0, use ffill or bfill instead.
"""
if method is not None:
if method in ["backfill", "bfill"]:
return df.bfill(axis=axis, limit=limit, inplace=inplace)
elif method in ["pad", "ffill"]:
return df.ffill(axis=axis, limit=limit, inplace=inplace)
return df.fillna(value=value, axis=axis, inplace=inplace)

@staticmethod
def _get_first_slice(op, df, end):
if op.method == "bfill":
@@ -115,11 +127,7 @@ def _execute_map(cls, ctx, op):
axis = op.axis
method = op.method

filled = input_data.fillna(
method=method,
axis=axis,
limit=limit,
)
filled = cls._apply_fillna_with_method(input_data, None, method, axis, limit)
ctx[op.outputs[0].key] = cls._get_first_slice(op, filled, 1)
del filled

@@ -137,15 +145,17 @@ def _execute_combine(cls, ctx, op):
summaries = [ctx[inp.key] for inp in op.inputs[1:]]

if not summaries:
ctx[op.outputs[0].key] = input_data.fillna(
method=method,
axis=axis,
limit=limit,
ctx[op.outputs[0].key] = cls._apply_fillna_with_method(
input_data, None, method, axis, limit
)
return

valid_summary = cls._get_first_slice(
op, pd.concat(summaries, axis=axis).fillna(method=method, axis=axis), 1
op,
cls._apply_fillna_with_method(
pd.concat(summaries, axis=axis), None, method, axis, limit
),
1,
)

if method == "bfill":
@@ -154,17 +164,12 @@ def _execute_combine(cls, ctx, op):
concat_df = pd.concat([valid_summary, input_data], axis=axis)

if is_pandas_2():
concat_df = concat_df.fillna(
method=method,
axis=axis,
limit=limit,
concat_df = cls._apply_fillna_with_method(
concat_df, None, method, axis, limit
)
else:
concat_df.fillna(
method=method,
axis=axis,
inplace=True,
limit=limit,
concat_df = cls._apply_fillna_with_method(
concat_df, None, method, axis, limit, inplace=True
)
ctx[op.outputs[0].key] = cls._get_first_slice(op, concat_df, -1)

@@ -180,11 +185,8 @@ def execute(cls, ctx, op):
if isinstance(op.value, ENTITY_TYPE):
value = ctx[op.value.key]
if not isinstance(input_data, pd.Index):
ctx[op.outputs[0].key] = input_data.fillna(
value=value,
method=op.method,
axis=op.axis,
limit=op.limit,
ctx[op.outputs[0].key] = cls._apply_fillna_with_method(
input_data, value, op.method, op.axis, op.limit
)
else:
ctx[op.outputs[0].key] = input_data.fillna(value=value)
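For reference, a standalone sketch of the dispatch performed by the new helper, on plain pandas objects (fillna_compat is an illustrative name, not part of the diff):

import numpy as np
import pandas as pd

def fillna_compat(obj, value=None, method=None, axis=None, limit=None):
    # route the deprecated method= values to ffill()/bfill(),
    # otherwise fall back to a plain value-based fillna()
    if method in ("backfill", "bfill"):
        return obj.bfill(axis=axis, limit=limit)
    if method in ("pad", "ffill"):
        return obj.ffill(axis=axis, limit=limit)
    return obj.fillna(value=value, axis=axis)

s = pd.Series([1.0, np.nan, np.nan, 4.0])
print(fillna_compat(s, method="ffill", limit=1))  # forward fill, at most one step
print(fillna_compat(s, value=0.0))                # plain value-based fill
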
@@ -84,11 +84,11 @@ def test_dataframe_fill_na_execution(setup):

# test forward fill in axis=0 without limit
r = df.fillna(method="pad")
pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="pad"))
pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.ffill())

# test backward fill in axis=0 without limit
r = df.fillna(method="backfill")
pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.fillna(method="backfill"))
pd.testing.assert_frame_equal(r.execute().fetch(), df_raw.bfill())

# test forward fill in axis=1 without limit
r = df.ffill(axis=1)