diff --git a/.github/ISSUE_TEMPLATE/other.md b/.github/ISSUE_TEMPLATE/other.md
index 0c143e89a..02357a920 100644
--- a/.github/ISSUE_TEMPLATE/other.md
+++ b/.github/ISSUE_TEMPLATE/other.md
@@ -7,6 +7,4 @@ assignees: ''
---
-Note that the issue tracker is NOT the place for general support. For
-discussions about development, questions about usage, or any general questions,
-contact us on https://discuss.xorbits.io/.
+Note that the issue tracker is NOT the place for general support.
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index e420b2f85..5f6b6547d 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -90,8 +90,8 @@ jobs:
- { os: ubuntu-latest, module: learn, python-version: 3.9 }
- { os: ubuntu-latest, module: mars-core, python-version: 3.9 }
- { os: ubuntu-20.04, module: hadoop, python-version: 3.9 }
- - { os: ubuntu-latest, module: vineyard, python-version: 3.9 }
- - { os: ubuntu-latest, module: external-storage, python-version: 3.9 }
+ - { os: ubuntu-latest, module: vineyard, python-version: 3.11 }
+ - { os: ubuntu-latest, module: external-storage, python-version: 3.11 }
# always test compatibility with the latest version
# - { os: ubuntu-latest, module: compatibility, python-version: 3.9 }
- { os: ubuntu-latest, module: doc-build, python-version: 3.9 }
@@ -131,7 +131,9 @@ jobs:
- name: Install ucx dependencies
if: ${{ (matrix.module != 'gpu') && (matrix.os == 'ubuntu-latest') && (matrix.module != 'doc-build') }}
run: |
- conda install -c conda-forge -c rapidsai ucx-proc=*=cpu ucx ucx-py
+ # ucx-py moved to ucxx, and ucxx-cu12 can run on CPU
+ # conda install -c conda-forge -c rapidsai ucx-proc=*=cpu ucx ucx-py
+ pip install ucxx-cu12
- name: Install libomp (macOS)
if: ${{ matrix.os == 'macos-latest' || matrix.os == 'macos-13' }}
run: brew install libomp
@@ -275,7 +277,7 @@ jobs:
run: |
source activate ${{ env.CONDA_ENV }}
pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.8.*
- pip install ucx-py-cu12 cython "numpy>=1.14.0,<2.0.0" cloudpickle scikit-learn \
+ pip install ucxx-cu12 cython "numpy>=1.14.0,<2.0.0" cloudpickle scikit-learn \
pyyaml psutil tornado sqlalchemy defusedxml tqdm uvloop coverage \
pytest pytest-cov pytest-timeout pytest-forked pytest-asyncio pytest-mock
pip install -U xoscar
diff --git a/README.md b/README.md
index bef6932d4..3800786f8 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,12 @@
-
+
[![PyPI Latest Release](https://img.shields.io/pypi/v/xorbits.svg?style=for-the-badge)](https://pypi.org/project/xorbits/)
[![License](https://img.shields.io/pypi/l/xorbits.svg?style=for-the-badge)](https://github.com/xorbitsai/xorbits/blob/main/LICENSE)
[![Coverage](https://img.shields.io/codecov/c/github/xorbitsai/xorbits?style=for-the-badge)](https://codecov.io/gh/xorbitsai/xorbits)
[![Build Status](https://img.shields.io/github/actions/workflow/status/xorbitsai/xorbits/python.yaml?branch=main&style=for-the-badge&label=GITHUB%20ACTIONS&logo=github)](https://actions-badge.atrox.dev/xorbitsai/xorbits/goto?ref=main)
-[![Doc](https://readthedocs.org/projects/xorbits/badge/?version=latest&style=for-the-badge)](https://doc.xorbits.io)
+[![Doc](https://readthedocs.org/projects/xorbits/badge/?version=latest&style=for-the-badge)](https://xorbits.readthedocs.io/)
[![Slack](https://img.shields.io/badge/join_Slack-781FF5.svg?logo=slack&style=for-the-badge)](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg)
[![Twitter](https://img.shields.io/twitter/follow/xorbitsio?logo=twitter&style=for-the-badge)](https://twitter.com/xorbitsio)
@@ -16,7 +16,7 @@ Xorbits is an open-source computing framework that makes it easy to scale data s
from data preprocessing to tuning, training, and model serving. Xorbits can leverage multi-cores or GPUs to accelerate
computation on a single machine or scale out up to thousands of machines to support processing terabytes of data and training or serving large models.
-Xorbits provides a suite of best-in-class [libraries](https://doc.xorbits.io/en/latest/libraries/index.html) for data
+Xorbits provides a suite of best-in-class [libraries](https://xorbits.readthedocs.io/en/latest/libraries/index.html) for data
scientists and machine learning practitioners. Xorbits provides the capability to scale tasks without the necessity for
extensive knowledge of infrastructure.
@@ -40,15 +40,15 @@ You can keep using your existing notebooks and still enjoy a significant speed b
### Process large datasets that pandas can't
-Xorbits can [leverage all of your computational cores](https://doc.xorbits.io/en/latest/getting_started/why_xorbits/pandas.html#boosting-performance-and-scalability-with-xorbits).
-It is especially beneficial for handling [larger datasets](https://doc.xorbits.io/en/latest/getting_started/why_xorbits/pandas.html#overcoming-memory-limitations-in-large-datasets-with-xorbits),
+Xorbits can [leverage all of your computational cores](https://xorbits.readthedocs.io/en/latest/getting_started/why_xorbits/pandas.html#boosting-performance-and-scalability-with-xorbits).
+It is especially beneficial for handling [larger datasets](https://xorbits.readthedocs.io/en/latest/getting_started/why_xorbits/pandas.html#overcoming-memory-limitations-in-large-datasets-with-xorbits),
where pandas may slow down or run out of memory.
### Lightning-fast speed
According to our benchmark tests, Xorbits surpasses other popular pandas API frameworks in speed and scalability.
-See our [performance comparison](https://doc.xorbits.io/en/latest/getting_started/why_xorbits/comparisons.html#performance-comparison)
-and [explanation](https://doc.xorbits.io/en/latest/getting_started/why_xorbits/fast.html).
+See our [performance comparison](https://xorbits.readthedocs.io/en/latest/getting_started/why_xorbits/comparisons.html#performance-comparison)
+, [explanation](https://xorbits.readthedocs.io/en/latest/getting_started/why_xorbits/fast.html) and [research paper](https://arxiv.org/abs/2401.00865).
### Leverage the Python ecosystem with native integrations
@@ -66,10 +66,9 @@ pip install xorbits
```
## Other resources
-* [Documentation](https://doc.xorbits.io)
-* [Examples and Tutorials](https://doc.xorbits.io/en/latest/getting_started/examples.html)
-* [Performance Benchmarks](https://xorbits.io/benchmark)
-* [Development Guide](https://doc.xorbits.io/en/latest/development/index.html)
+* [Documentation](https://xorbits.readthedocs.io)
+* [Performance Benchmarks](https://xorbits.readthedocs.io/en/latest/getting_started/why_xorbits/comparisons.html#performance-comparison)
+* [Development Guide](https://xorbits.readthedocs.io/en/latest/development/index.html)
* [Research Paper on Xorbits' Internals](https://arxiv.org/abs/2401.00865)
## License
@@ -97,24 +96,24 @@ with other upcoming ones we will propose. Stay tuned!
| Platform | Purpose |
|-----------------------------------------------------------------------------------------------|----------------------------------------------------|
-| [Discourse Forum](https://discuss.xorbits.io) | Asking usage questions and discussing development. |
| [Github Issues](https://github.com/xorbitsai/xorbits/issues) | Reporting bugs and filing feature requests. |
-| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. |
| [StackOverflow](https://stackoverflow.com/questions/tagged/xorbits) | Asking questions about how to use Xorbits. |
-| [Twitter](https://twitter.com/xorbitsio) | Staying up-to-date on new features. |
+| [Slack](https://join.slack.com/t/xorbitsio/shared_invite/zt-1o3z9ucdh-RbfhbPVpx7prOVdM1CAuxg) | Collaborating with other Xorbits users. |
## Citing Xorbits
-If Xorbits could help you, please cite our paper which is accepted by ICDE 2024 Industry and Applications Track:
+If Xorbits could help you, please cite our paper using the following metadata:
```
-@article{lu2023xorbits,
- title={Xorbits: Automating Operator Tiling for Distributed Data Science},
- author={Weizheng Lu and Kaisheng He and Xuye Qin and Chengjie Li and Zhong Wang and Tao Yuan and Feng Zhang and Yueguo Chen and Xiaoyong Du},
- year={2023},
- archivePrefix={arXiv},
- url={https://doi.org/10.48550/arXiv.2401.00865},
- eprinttype={arXiv},
- eprint={2401.00865},
+@inproceedings{lu2024Xorbits,
+ title = {Xorbits: Automating Operator Tiling for Distributed Data Science},
+ shorttitle = {Xorbits},
+ booktitle = {2024 {{IEEE}} 40th {{International Conference}} on {{Data Engineering}} ({{ICDE}})},
+ author = {Lu, Weizheng and He, Kaisheng and Qin, Xuye and Li, Chengjie and Wang, Zhong and Yuan, Tao and Liao, Xia and Zhang, Feng and Chen, Yueguo and Du, Xiaoyong},
+ year = {2024},
+ month = may,
+ pages = {5211--5223},
+ issn = {2375-026X},
+ doi = {10.1109/ICDE60146.2024.00392},
}
```
\ No newline at end of file
diff --git a/doc/source/development/contributing_documentation.rst b/doc/source/development/contributing_documentation.rst
index c2fc4ec1c..eee1afa0c 100644
--- a/doc/source/development/contributing_documentation.rst
+++ b/doc/source/development/contributing_documentation.rst
@@ -164,4 +164,4 @@ Building main branch documentation
When pull requests are merged into the Xorbits ``main`` branch, the main parts of
the documentation are also built by readthedocs. These docs are then hosted `here
-`__.
\ No newline at end of file
+`__.
\ No newline at end of file
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index e5384b756..f854d90a2 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -63,9 +63,6 @@ If no C compiler is installed, or you wish to upgrade, or you're using a differe
Linux distribution, consult your favorite search engine for compiler installation/update
instructions.
-Let us know if you have any difficulties by opening an issue or reaching out on our contributor
-community, join slack in `Community `_.
-
Step 2: install Node.js
-----------------------
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 63913a4b2..d5fb49daa 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -20,9 +20,7 @@ the capability to scale tasks without the necessity for extensive knowledge of i
- :ref:`Xorbits Train `: Train your own state-of-the-art models for ML and DL frameworks such as PyTorch, XGBoost, etc.
-- :ref:`Xorbits Tune `: Finetune your models by running state of the art algorithms such as PEFT.
-
-- :ref:`Xorbits Inference `: Scalable serving to deploy state-of-the-art models. Integrate with the most popular deep learning libraries, like PyTorch, ggml, etc.
+- `Xorbits Inference `_: Scalable serving to deploy state-of-the-art models. Integrate with the most popular deep learning libraries, like PyTorch, ggml, etc.
Xorbits features a familiar Python API that supports a variety of libraries, including pandas, NumPy, scikit-learn, PyTorch,
XGBoost, Xarray, etc. With a simple modification of just one line of code, your pandas workflow can be seamlessly scaled using
@@ -75,3 +73,6 @@ Getting involved
user_guide/index
reference/index
development/index
+
+
+.. _xorbits_inference_index: https://github.com/xorbitsai/inference
\ No newline at end of file
diff --git a/doc/source/libraries/xorbits_data/index.rst b/doc/source/libraries/xorbits_data/index.rst
index 902c7e435..8fcc58c9e 100644
--- a/doc/source/libraries/xorbits_data/index.rst
+++ b/doc/source/libraries/xorbits_data/index.rst
@@ -8,20 +8,9 @@ Xorbits Data
Xorbits Data is a scalable data processing library for ML and DS workloads.
-Xorbits Data can leverage multi cores or GPUs to accelerate computation on a single machine,
+Xorbits Data can leverage multi cores or GPUs to accelerate computation on a single machine,
or scale out up to thousands of machines to support processing terabytes of data.
-All APIs in Xorbits Data library implemented or planned include:
-
-======================================= =========================================================
-API Implemented version or plan
-======================================= =========================================================
-:ref:`xorbits.pandas ` v0.1.0
-:ref:`xorbits.numpy ` v0.1.0
-:ref:`xorbits.xarray` Planned in the future
-======================================= =========================================================
-
-
.. toctree::
:maxdepth: 2
diff --git a/doc/source/libraries/xorbits_data/numpy.rst b/doc/source/libraries/xorbits_data/numpy.rst
index 9b18d6713..2852d2079 100644
--- a/doc/source/libraries/xorbits_data/numpy.rst
+++ b/doc/source/libraries/xorbits_data/numpy.rst
@@ -118,8 +118,8 @@ elements that we want, instead of the step::
>>> f = np.sin(x)
-However, the way of loading and saving arrays is quite different. Please see :ref:`io ` for
-detailed info. Here's an example of creating and loading an HDF5 file::
+However, the way of loading and saving arrays is quite different. Here's an example of creating
+and loading an HDF5 file::
>>> import h5py # if you don't have h5py installed, run "pip install h5py" first
>>> arr = np.random.randn(1000)
diff --git a/doc/source/libraries/xorbits_inference/index.rst b/doc/source/libraries/xorbits_inference/index.rst
deleted file mode 100644
index a0fea37b0..000000000
--- a/doc/source/libraries/xorbits_inference/index.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-.. _xorbits_inference_index:
-
-=================
-Xorbits Inference
-=================
-
-.. toctree::
- :maxdepth: 2
-
-
-
-Coming soon
\ No newline at end of file
diff --git a/doc/source/libraries/xorbits_train/index.rst b/doc/source/libraries/xorbits_train/index.rst
index 06e79c2a9..a07d6ad90 100644
--- a/doc/source/libraries/xorbits_train/index.rst
+++ b/doc/source/libraries/xorbits_train/index.rst
@@ -10,19 +10,6 @@ Xorbits Train
Xorbits Train scales model training for popular ML and DL frameworks such as PyTorch, XGBoost, scikit-learn, etc.
-
-All APIs in Xorbits Train library implemented or planned include:
-
-======================================= =========================================================
-API Implemented version or plan
-======================================= =========================================================
-:ref:`xorbits.xgboost ` v0.4.0
-:ref:`xorbits.lightgbm ` v0.4.0
-``xorbits.sklearn`` Planned in the near future
-``xorbits.scipy`` Planned in the future
-``xorbits.statsmodels`` Planned in the future
-======================================= =========================================================
-
.. toctree::
:maxdepth: 2
diff --git a/doc/source/libraries/xorbits_tune/index.rst b/doc/source/libraries/xorbits_tune/index.rst
deleted file mode 100644
index 057fd4a6e..000000000
--- a/doc/source/libraries/xorbits_tune/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-.. _xorbits_tune_index:
-
-=============
-Xorbits Tune
-=============
-
-.. toctree::
- :maxdepth: 2
-
-
-Coming soon
\ No newline at end of file
diff --git a/doc/source/reference/index.rst b/doc/source/reference/index.rst
index 7833a82a9..29036da3b 100644
--- a/doc/source/reference/index.rst
+++ b/doc/source/reference/index.rst
@@ -9,6 +9,4 @@ API Reference
xorbits/index
datasets/index
- xgboost/index
- lightgbm/index
experimental/index
\ No newline at end of file
diff --git a/doc/source/reference/lightgbm/learning.rst b/doc/source/reference/lightgbm/learning.rst
index f363b72e7..c23fc06d6 100644
--- a/doc/source/reference/lightgbm/learning.rst
+++ b/doc/source/reference/lightgbm/learning.rst
@@ -6,7 +6,7 @@ Learning API
.. currentmodule:: xorbits.lightgbm
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
predict
predict_proba
\ No newline at end of file
diff --git a/doc/source/reference/lightgbm/sklearn.rst b/doc/source/reference/lightgbm/sklearn.rst
index 9fc36acf0..a226f5e9c 100644
--- a/doc/source/reference/lightgbm/sklearn.rst
+++ b/doc/source/reference/lightgbm/sklearn.rst
@@ -11,7 +11,7 @@ LGBMClassifier
Constructor
~~~~~~~~~~~
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
LGBMClassifier
@@ -19,7 +19,7 @@ Constructor
Attributes
~~~~~~~~~~
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
LGBMClassifier.fit
LGBMClassifier.get_params
@@ -37,7 +37,7 @@ LGBMRegressor
Constructor
~~~~~~~~~~~
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
LGBMRegressor
@@ -45,7 +45,7 @@ Constructor
Attributes
~~~~~~~~~~
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
LGBMRegressor.fit
LGBMRegressor.get_params
@@ -63,7 +63,7 @@ LGBMRanker
Constructor
~~~~~~~~~~~
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
LGBMRanker
@@ -71,7 +71,7 @@ Constructor
Attributes
~~~~~~~~~~
.. autosummary::
- :toctree: _generate/
+ :toctree: generated/
LGBMRanker.fit
LGBMRanker.get_params
diff --git a/doc/source/user_guide/best_practices.rst b/doc/source/user_guide/best_practices.rst
index 6f4a9eaa7..5ba2f82b2 100644
--- a/doc/source/user_guide/best_practices.rst
+++ b/doc/source/user_guide/best_practices.rst
@@ -12,3 +12,4 @@ practices, and helps users solve some common problems.
:maxdepth: 2
loading_data
+ storage_backend
diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst
index e420f6ee2..ffb01b85e 100644
--- a/doc/source/user_guide/index.rst
+++ b/doc/source/user_guide/index.rst
@@ -18,6 +18,6 @@ Further information on any specific method can be obtained in the
:maxdepth: 2
deferred_execution
- best_practices
deployment
+ best_practices
logging
diff --git a/doc/source/user_guide/loading_data.rst b/doc/source/user_guide/loading_data.rst
index ef4758681..0215a276b 100644
--- a/doc/source/user_guide/loading_data.rst
+++ b/doc/source/user_guide/loading_data.rst
@@ -1,10 +1,10 @@
.. _loading_data:
==============
-Loading data
+Loading Data
==============
-Recommended data formats
+Recommended Data Formats
-------------------------
Xorbits supports reading data from various data sources, including csv, parquet, sql, xml and other data formats,
diff --git a/doc/source/user_guide/storage_backend.rst b/doc/source/user_guide/storage_backend.rst
new file mode 100644
index 000000000..d9fcca4f9
--- /dev/null
+++ b/doc/source/user_guide/storage_backend.rst
@@ -0,0 +1,105 @@
+.. _storage_backend:
+
+===============
+Storage Backend
+===============
+
+Xorbits is a distributed computing system that utilizes computation graphs to decompose big data.
+The smallest unit in a computation graph is a subtask graph. Executing a subtask graph means calling
+single-machine numpy or pandas, and the generated intermediate data is stored in memory by default.
+The backend used for storing this intermediate data is called the **Storage Backend**.
+
+Specifically, the default memory storage backend is called shared memory. Besides shared memory,
+there are several other types of storage backends: GPU, filesystem, mmap, etc.
+
+Shared Memory
+-------------
+
+Shared memory is the default storage backend in Xorbits. It is fast and efficient when the data size
+can fit into the memory of a computing node. Shared memory means that the data can be accessed by one
+or more processes on a multicore or symmetric multiprocessor (SMP) machine. We implemented this backend
+using Python 3's `multiprocessing.shared_memory `_.
+In most scenarios, shared memory performs well. We don't need to switch to other storage backends.
+
+Filesystem
+----------
+
+Usually, the available capacity of the filesystem is much larger than that of RAM.
+When your RAM is not large enough, you can use the filesystem to store these intermediate results.
+The filesystem can be further divided into two categories: local and distributed.
+Examples of local filesystems are local disk or mmap; distributed filesystems are systems like
+JuiceFS and Alluxio.
+
+Local
+^^^^^
+
+Suppose you want to use the `/tmp` directory on your local disk as the storage backend. You should
+create a YAML configuration file named `file.yml` which specifies `backends` and `root_dirs`.
+
+.. code-block:: yaml
+
+ "@inherits": "@default"
+ storage:
+ backends: [disk]
+ disk:
+ root_dirs: "/tmp"
+
+Start the worker using the :code:`-f file.yml` option:
+
+.. code-block:: bash
+
+ xorbits-worker -H -p -s : -f file.yml
+
+
+mmap
+^^^^
+
+mmap (memory-mapped file) is a technique that maps a filesystem file or a device into RAM. Pandas may
+encounter OOM (Out-of-Memory) issues when processing large datasets. Xorbits' mmap storage backend enables
+users to handle datasets much larger than the available RAM capacity. Basically, Xorbits's mmap controls
+the amount of data in runtime memory at a stable level, loading data from disk when necessary. Therefore,
+mmap can handle datasets as large as the available disk space.
+
+Xorbits's mmap can now run on a single-node setup. Just initialize it in the :code:`xorbits.init()` method
+and specify :code:`root_dirs` as a disk file path:
+
+.. code-block:: python
+
+ import xorbits
+ xorbits.init(storage_config={"mmap": {"root_dirs": ""}})
+
+
+
+Distributed Filesystems
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Both Alluxio and JuiceFS provide a FUSE (Filesystem in Userspace) interface, enabling users to access
+large-scale data using standard POSIX interfaces. They allow users to interact with distributed data
+as if it is on a local filesystem path like `/path/to/data`. When a user reads from or writes to a local
+path, the data is first cached in memory, and then persisted in a remote underlying big data storage engine,
+such as HDFS or S3.
+
+Suppose you mount Alluxio or JuiceFS on `/mnt/xorbits`. You can write a YAML file just like the local filesystem
+and start the worker by adding :code:`-f file.yml` option.
+
+.. code-block:: yaml
+
+ "@inherits": "@default"
+ storage:
+ backends: [disk]
+ disk:
+ root_dirs: "/mnt/xorbits"
+
+
+GPU
+---
+
+If you want to run tasks on GPUs, add the :code:`gpu=True` parameter to the data loading method. For example:
+
+.. code-block:: python
+
+ import xorbits.pandas as pd
+ pd.read_parquet(path, gpu=True)
+
+
+All subsequent operations will run on GPUs.
\ No newline at end of file
diff --git a/python/setup.cfg b/python/setup.cfg
index ca085787c..045cb0292 100644
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -77,7 +77,6 @@ doc =
pydata-sphinx-theme>=0.3.0
sphinx-intl>=0.9.9
matplotlib
- datasets
extra =
pillow>=7.0.0
lz4>=1.0.0
diff --git a/python/xorbits/_mars/deploy/oscar/cmdline.py b/python/xorbits/_mars/deploy/oscar/cmdline.py
index 4d85f9dfd..c8b9823d5 100644
--- a/python/xorbits/_mars/deploy/oscar/cmdline.py
+++ b/python/xorbits/_mars/deploy/oscar/cmdline.py
@@ -134,8 +134,8 @@ def parse_args(self, parser, argv, environ=None):
if args.endpoint is not None and args.host is not None: # pragma: no cover
raise ValueError("Cannot specify host and endpoint at the same time")
- if "MARS_TASK_DETAIL" in environ:
- task_detail = json.loads(environ["MARS_TASK_DETAIL"])
+ if "XORBITS_TASK_DETAIL" in environ:
+ task_detail = json.loads(environ["XORBITS_TASK_DETAIL"])
task_type, task_index = (
task_detail["task"]["type"],
task_detail["task"]["index"],
diff --git a/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py b/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py
index 90fb2cb96..068111ac8 100644
--- a/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py
+++ b/python/xorbits/_mars/deploy/oscar/tests/test_cmdline.py
@@ -285,11 +285,11 @@ def test_parse_args():
"""
env = {
- "MARS_LOAD_MODULES": "extra.module",
- "MARS_TASK_DETAIL": task_detail,
- "MARS_CACHE_MEM_SIZE": "20M",
- "MARS_PLASMA_DIRS": "/dev/shm",
- "MARS_SPILL_DIRS": "/tmp",
+ "XORBITS_LOAD_MODULES": "extra.module",
+ "XORBITS_TASK_DETAIL": task_detail,
+ "XORBITS_CACHE_MEM_SIZE": "20M",
+ "XORBITS_PLASMA_DIRS": "/dev/shm",
+ "XORBITS_SPILL_DIRS": "/tmp",
}
args = app.parse_args(parser, ["-p", "10324"], env)
assert args.host == "worker1"
@@ -366,7 +366,7 @@ def test_parse_third_party_modules():
"worker": ["worker.module"],
}
}
- env = {"MARS_LOAD_MODULES": "extra.module"}
+ env = {"XORBITS_LOAD_MODULES": "extra.module"}
parser = argparse.ArgumentParser(description="TestService")
app = WorkerCommandRunner()
diff --git a/python/xorbits/_mars/deploy/oscar/worker.py b/python/xorbits/_mars/deploy/oscar/worker.py
index 47c0b6e0a..81c77b873 100644
--- a/python/xorbits/_mars/deploy/oscar/worker.py
+++ b/python/xorbits/_mars/deploy/oscar/worker.py
@@ -24,7 +24,7 @@
class WorkerCommandRunner(OscarCommandRunner):
- command_description = "Mars Worker"
+ command_description = "Xorbits Worker"
node_role = NodeRole.WORKER
def __init__(self):
@@ -55,7 +55,7 @@ def parse_args(self, parser, argv, environ=None):
self.config.get("cluster", {}).get("backend", "fixed") == "fixed"
and not args.supervisors
): # pragma: no cover
- raise ValueError("--supervisors is needed to start Mars Worker")
+ raise ValueError("--supervisors is needed to start Xorbits Worker")
args.supervisors = self._process_supervisors_addr_scheme(args.supervisors)
@@ -86,13 +86,13 @@ def parse_args(self, parser, argv, environ=None):
backends = storage_config["backends"] = storage_config.get("backends", [])
plasma_config = storage_config["plasma"] = storage_config.get("plasma", {})
disk_config = storage_config["disk"] = storage_config.get("disk", {})
- if "MARS_CACHE_MEM_SIZE" in environ:
- plasma_config["store_memory"] = environ["MARS_CACHE_MEM_SIZE"]
- if "MARS_PLASMA_DIRS" in environ:
- plasma_config["plasma_directory"] = environ["MARS_PLASMA_DIRS"]
- if "MARS_SPILL_DIRS" in environ:
+ if "XORBITS_CACHE_MEM_SIZE" in environ:
+ plasma_config["store_memory"] = environ["XORBITS_CACHE_MEM_SIZE"]
+ if "XORBITS_PLASMA_DIRS" in environ:
+ plasma_config["plasma_directory"] = environ["XORBITS_PLASMA_DIRS"]
+ if "XORBITS_SPILL_DIRS" in environ:
backends.append("disk")
- disk_config["root_dirs"] = environ["MARS_SPILL_DIRS"]
+ disk_config["root_dirs"] = environ["XORBITS_SPILL_DIRS"]
return args
diff --git a/python/xorbits/_mars/deploy/tests/test_utils.py b/python/xorbits/_mars/deploy/tests/test_utils.py
index 7e6f8dce8..76a522153 100644
--- a/python/xorbits/_mars/deploy/tests/test_utils.py
+++ b/python/xorbits/_mars/deploy/tests/test_utils.py
@@ -71,7 +71,7 @@ def test_get_third_party_modules_from_config():
r = get_third_party_modules_from_config(config, NodeRole.WORKER)
assert r == ["ab.module"]
- os.environ["MARS_LOAD_MODULES"] = "c.module,d.module"
+ os.environ["XORBITS_LOAD_MODULES"] = "c.module,d.module"
try:
r = get_third_party_modules_from_config(config, NodeRole.SUPERVISOR)
assert r == ["ab.module", "c.module", "d.module"]
@@ -82,7 +82,7 @@ def test_get_third_party_modules_from_config():
r = get_third_party_modules_from_config({}, NodeRole.WORKER)
assert r == ["c.module", "d.module"]
finally:
- os.environ.pop("MARS_LOAD_MODULES", None)
+ os.environ.pop("XORBITS_LOAD_MODULES", None)
config = {"third_party_modules": "ab.module"}
with pytest.raises(TypeError, match="str"):
diff --git a/python/xorbits/_mars/deploy/utils.py b/python/xorbits/_mars/deploy/utils.py
index 979e19c1a..f787be88c 100644
--- a/python/xorbits/_mars/deploy/utils.py
+++ b/python/xorbits/_mars/deploy/utils.py
@@ -212,7 +212,7 @@ def get_third_party_modules_from_config(config: Dict, role: NodeRole, environ=No
)
all_modules = []
- for mods in tuple(modules or ()) + (environ.get("MARS_LOAD_MODULES"),):
+ for mods in tuple(modules or ()) + (environ.get("XORBITS_LOAD_MODULES"),):
all_modules.extend(mods.split(",") if mods else [])
return all_modules
diff --git a/python/xorbits/_mars/deploy/yarn/config.py b/python/xorbits/_mars/deploy/yarn/config.py
index fdefd3ff3..268040ed3 100644
--- a/python/xorbits/_mars/deploy/yarn/config.py
+++ b/python/xorbits/_mars/deploy/yarn/config.py
@@ -209,7 +209,7 @@ def add_default_envs(self):
self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory)))
if self._modules:
- self.add_env("MARS_LOAD_MODULES", ",".join(self._modules))
+ self.add_env("XORBITS_LOAD_MODULES", ",".join(self._modules))
if self._path_environ:
self.add_env("MARS_YARN_PATH", self._path_environ)
diff --git a/python/xorbits/_mars/deploy/yarn/tests/test_config.py b/python/xorbits/_mars/deploy/yarn/tests/test_config.py
index 1a8045e89..569862ab7 100644
--- a/python/xorbits/_mars/deploy/yarn/tests/test_config.py
+++ b/python/xorbits/_mars/deploy/yarn/tests/test_config.py
@@ -64,7 +64,7 @@ def test_supervisor_config():
assert config_envs["MKL_NUM_THREADS"] == "2"
assert config_envs["MARS_CPU_TOTAL"] == "2"
assert int(config_envs["MARS_MEMORY_TOTAL"]) == 10 * 1024**3
- assert config_envs["MARS_LOAD_MODULES"] == "mars.test_mod"
+ assert config_envs["XORBITS_LOAD_MODULES"] == "mars.test_mod"
config = MarsSupervisorConfig(
"conda://path/to_env",
diff --git a/python/xorbits/_mars/storage/filesystem.py b/python/xorbits/_mars/storage/filesystem.py
index 3a9cb6da2..9286a0a26 100644
--- a/python/xorbits/_mars/storage/filesystem.py
+++ b/python/xorbits/_mars/storage/filesystem.py
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
+import logging
import os
import uuid
from typing import Dict, List, Optional, Tuple
@@ -25,6 +26,8 @@
from .base import ObjectInfo, StorageBackend, StorageLevel, register_storage_backend
from .core import StorageFileObject
+logger = logging.getLogger(__name__)
+
@register_storage_backend
class FileSystemStorage(StorageBackend):
@@ -50,6 +53,9 @@ async def setup(cls, **kwargs) -> Tuple[Dict, Dict]:
f'FileSystemStorage got unexpected config: {",".join(kwargs)}'
)
+ # `root_dirs` is actually a single directory here
+ # but the interface is designed to accept a list of directories
+ # so we need to split it if it's a single directory
if isinstance(root_dirs, str):
root_dirs = root_dirs.split(":")
if isinstance(level, str):
@@ -157,13 +163,13 @@ class AlluxioStorage(FileSystemStorage):
def __init__(
self,
- root_dir: str,
+ root_dirs: List[str],
local_environ: bool, # local_environ means standalone mode
level: StorageLevel = None,
size: int = None,
):
self._fs = AioFilesystem(LocalFileSystem())
- self._root_dirs = [root_dir]
+ self._root_dirs = root_dirs
self._level = level
self._size = size
self._local_environ = local_environ
@@ -172,29 +178,34 @@ def __init__(
@implements(StorageBackend.setup)
async def setup(cls, **kwargs) -> Tuple[Dict, Dict]:
kwargs["level"] = StorageLevel.MEMORY
- root_dir = kwargs.get("root_dir")
+ root_dirs = kwargs.get("root_dirs")
+ # `root_dirs` is actually a single directory here
+ # but the interface is designed to accept a list of directories
+ # so we need to split it if it's a single directory
+ if isinstance(root_dirs, str):
+ root_dirs = root_dirs.split(":")
local_environ = kwargs.get("local_environ")
if local_environ:
proc = await asyncio.create_subprocess_shell(
f"""$ALLUXIO_HOME/bin/alluxio fs mkdir /alluxio-storage
- $ALLUXIO_HOME/integration/fuse/bin/alluxio-fuse mount {root_dir} /alluxio-storage
+ $ALLUXIO_HOME/integration/fuse/bin/alluxio-fuse mount {root_dirs[0]} /alluxio-storage
"""
)
await proc.wait()
params = dict(
- root_dir=root_dir,
+ root_dirs=root_dirs,
level=StorageLevel.MEMORY,
size=None,
local_environ=local_environ,
)
- return params, dict(root_dir=root_dir)
+ return params, dict(root_dirs=root_dirs)
@staticmethod
@implements(StorageBackend.teardown)
async def teardown(**kwargs):
- root_dir = kwargs.get("root_dir")
+ root_dirs = kwargs.get("root_dirs")
proc = await asyncio.create_subprocess_shell(
- f"""$ALLUXIO_HOME/integration/fuse/bin/alluxio-fuse unmount {root_dir} /alluxio-storage
+ f"""$ALLUXIO_HOME/integration/fuse/bin/alluxio-fuse unmount {root_dirs[0]} /alluxio-storage
$ALLUXIO_HOME/bin/alluxio fs rm -R /alluxio-storage
"""
)
@@ -227,6 +238,11 @@ async def setup(cls, **kwargs) -> Tuple[Dict, Dict]:
kwargs["level"] = StorageLevel.MEMORY
in_k8s = kwargs.get("in_k8s")
root_dirs = kwargs.get("root_dirs")
+        # `root_dirs` may arrive as a single colon-separated string
+        # even though the interface is declared to take a list of
+        # directories, so normalize it into a list in that case
+ if isinstance(root_dirs, str):
+ root_dirs = root_dirs.split(":")
params = dict(
root_dirs=root_dirs,
level=StorageLevel.MEMORY,
diff --git a/python/xorbits/_mars/storage/tests/test_libs.py b/python/xorbits/_mars/storage/tests/test_libs.py
index 5f6a2454a..478dd0073 100644
--- a/python/xorbits/_mars/storage/tests/test_libs.py
+++ b/python/xorbits/_mars/storage/tests/test_libs.py
@@ -80,7 +80,7 @@ async def storage_context(request):
elif request.param == "alluxio":
tempdir = tempfile.mkdtemp()
params, teardown_params = await AlluxioStorage.setup(
- root_dir=tempdir, local_environ=True
+ root_dirs=[tempdir], local_environ=True
)
storage = AlluxioStorage(**params)
assert storage.level == StorageLevel.MEMORY
@@ -212,8 +212,8 @@ async def test_base_operations(storage_context):
put_info3 = await storage.put(s)
get_data3 = await storage.get(put_info3.object_id)
assert isinstance(get_data3, SparseMatrix)
- np.testing.assert_array_equal(get_data3.toarray(), s1.A)
- np.testing.assert_array_equal(get_data3.todense(), s1.A)
+ np.testing.assert_array_equal(get_data3.toarray(), s1.toarray())
+ np.testing.assert_array_equal(get_data3.todense(), s1.toarray())
@pytest.mark.asyncio
diff --git a/python/xorbits/_mars/utils.py b/python/xorbits/_mars/utils.py
index b24031e7c..da4ed4a3a 100644
--- a/python/xorbits/_mars/utils.py
+++ b/python/xorbits/_mars/utils.py
@@ -1161,7 +1161,7 @@ def is_object_dtype(dtype: np.dtype) -> bool:
try:
return (
np.issubdtype(dtype, np.object_)
- or np.issubdtype(dtype, np.unicode_)
+ or np.issubdtype(dtype, np.str_)
or np.issubdtype(dtype, np.bytes_)
)
except TypeError: # pragma: no cover
diff --git a/python/xorbits/deploy/__init__.py b/python/xorbits/deploy/__init__.py
index 19130d09d..91b04c4b5 100644
--- a/python/xorbits/deploy/__init__.py
+++ b/python/xorbits/deploy/__init__.py
@@ -118,7 +118,7 @@ def init(
if len(backends) > 1 or len(backends) == 0:
raise ValueError("Only support one storage backend.")
backend = backends[0]
- if backend not in ["shared_memory", "mmap"]:
+ if backend not in ["shared_memory", "mmap", "disk"]:
raise ValueError(
"Only support one of these storage backends: `shared_memory`, `mmap`."
)
diff --git a/python/xorbits/deploy/kubernetes/config.py b/python/xorbits/deploy/kubernetes/config.py
index ed517cd1d..c9721d462 100644
--- a/python/xorbits/deploy/kubernetes/config.py
+++ b/python/xorbits/deploy/kubernetes/config.py
@@ -759,7 +759,7 @@ def add_default_envs(self):
self.add_env("MARS_MEMORY_TOTAL", str(int(self._memory)))
if self._modules:
- self.add_env("MARS_LOAD_MODULES", ",".join(self._modules))
+ self.add_env("XORBITS_LOAD_MODULES", ",".join(self._modules))
if self._external_storage == "juicefs":
self.add_env("XORBITS_EXTERNAL_STORAGE", "juicefs")
diff --git a/python/xorbits/deploy/kubernetes/tests/test_config.py b/python/xorbits/deploy/kubernetes/tests/test_config.py
index 955172d44..ad39f3bbc 100644
--- a/python/xorbits/deploy/kubernetes/tests/test_config.py
+++ b/python/xorbits/deploy/kubernetes/tests/test_config.py
@@ -78,7 +78,7 @@ def test_supervisor_object():
assert container_envs["MKL_NUM_THREADS"]["value"] == "2"
assert container_envs["MARS_CPU_TOTAL"]["value"] == "2"
assert int(container_envs["MARS_MEMORY_TOTAL"]["value"]) == 10 * 1024**3
- assert container_envs["MARS_LOAD_MODULES"]["value"] == "xorbits.test_mod"
+ assert container_envs["XORBITS_LOAD_MODULES"]["value"] == "xorbits.test_mod"
supervisor_config = XorbitsSupervisorsConfig(
1,
@@ -126,7 +126,7 @@ def test_worker_object():
assert container_envs["MKL_NUM_THREADS"]["value"] == "2"
assert container_envs["MARS_CPU_TOTAL"]["value"] == "2"
assert int(container_envs["MARS_MEMORY_TOTAL"]["value"]) == 10 * 1024**3
- assert container_envs["MARS_LOAD_MODULES"]["value"] == "xorbits.test_mod"
+ assert container_envs["XORBITS_LOAD_MODULES"]["value"] == "xorbits.test_mod"
assert set(container_envs["MARS_SPILL_DIRS"]["value"].split(":")) == {
"/tmp/empty",
"/mnt/hostpath0",