From 9236460055cab667d22775ec05dd2de8220b0abe Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 14:41:00 -0700 Subject: [PATCH 01/16] feat: added xpu module with lightning classes Signed-off-by: Lee, Kin Long Kelvin --- matsciml/lightning/xpu.py | 225 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 matsciml/lightning/xpu.py diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py new file mode 100644 index 00000000..97a2c383 --- /dev/null +++ b/matsciml/lightning/xpu.py @@ -0,0 +1,225 @@ +# Copyright (C) 2023 Intel Corporation +# SPDX-License-Identifier: MIT License +from __future__ import annotations +from datetime import timedelta +from typing import Callable, Union, List, Dict, Any + +from lightning_lite.plugins import CheckpointIO, ClusterEnvironment +from lightning_lite.plugins.collectives.torch_collective import default_pg_timeout +from pytorch_lightning.plugins.precision import ( + PrecisionPlugin, + NativeMixedPrecisionPlugin, +) + +from matsciml.common.packages import package_registry +from matsciml.lightning.ddp import MPIEnvironment +from pytorch_lightning.accelerators import Accelerator, AcceleratorRegistry +from pytorch_lightning.strategies import SingleDeviceStrategy +from pytorch_lightning.strategies.ddp import DDPStrategy +import torch +from torch import distributed as dist + +if package_registry["ipex"]: + import intel_extension_for_pytorch as ipex + + __all__ = ["XPUAccelerator", "SingleXPUStrategy", "DDPXPUStrategy"] + + class XPUAccelerator(Accelerator): + + """ + Implements a Lightning Accelerator class for Intel GPU usage. Depends + on Intel Extension for PyTorch to be installed. + """ + + @staticmethod + def parse_devices(devices: Union[int, List[int]]) -> List[int]: + """ + Parse the `trainer` input for devices and homogenize them. + Parameters + ---------- + devices : Union[int, List[int]] + Single or list of device numbers to use + Returns + ------- + List[int] + List of device numbers to use + """ + if isinstance(devices, int): + devices = [ + devices, + ] + return devices + + def setup_device(self, device: torch.device) -> None: + """ + Configure the current process to use a specified device. + Perhaps unreliably and misguiding, the IPEX implementation of this method + tries to mirror the CUDA version but `ipex.xpu.set_device` actually refuses + to accept anything other than an index. I've tried to work around this + by grabbing the index from the device if possible, and just setting + it to the first device if not using a distributed/multitile setup. + """ + # first try and see if we can grab the index from the device + index = getattr(device, "index", None) + if index is None and not dist.is_initialized(): + index = 0 + torch.xpu.set_device(index) + + def teardown(self) -> None: + # as it suggests, this is run on cleanup + torch.xpu.empty_cache() + + def get_device_stats(self, device) -> Dict[str, Any]: + return torch.xpu.memory_stats(device) + + @staticmethod + def get_parallel_devices(devices: List[int]) -> List[torch.device]: + """ + Return a list of torch devices corresponding to what is available. + Essentially maps indices to `torch.device` objects. + Parameters + ---------- + devices : List[int] + List of integers corresponding to device numbers + Returns + ------- + List[torch.device] + List of `torch.device` objects for each device + """ + return [torch.device("xpu", i) for i in devices] + + @staticmethod + def auto_device_count() -> int: + # by default, PVC has two tiles per GPU + return torch.xpu.device_count() + + @staticmethod + def is_available() -> bool: + """ + Determines if an XPU is actually available. + Returns + ------- + bool + True if devices are detected, otherwise False + """ + try: + return torch.xpu.device_count() != 0 + except (AttributeError, NameError): + return False + + @classmethod + def register_accelerators(cls, accelerator_registry) -> None: + accelerator_registry.register( + "xpu", + cls, + description="Intel Data Center GPU Max - codename Ponte Vecchio", + ) + + # add PVC to the registry + AcceleratorRegistry.register("xpu", XPUAccelerator) + + class SingleXPUStrategy(SingleDeviceStrategy): + + """ + This class implements the strategy for using a single PVC tile. + """ + + strategy_name = "pvc_single" + + def __init__( + self, + device: str | None = "xpu", + checkpoint_io=None, + precision_plugin=None, + ): + super().__init__( + device=device, + accelerator=XPUAccelerator(), + checkpoint_io=checkpoint_io, + precision_plugin=precision_plugin, + ) + + @property + def is_distributed(self) -> bool: + return False + + def setup(self, trainer) -> None: + self.model_to_device() + super().setup(trainer) + + def setup_optimizers(self, trainer) -> None: + super().setup_optimizers(trainer) + + def model_to_device(self) -> None: + self.model.to(self.root_device) + + @classmethod + def register_strategies(cls, strategy_registry) -> None: + strategy_registry.register( + cls.strategy_name, + cls, + description=f"{cls.__class__.__name__} - uses a single XPU tile for compute.", + ) + + class DDPXPUStrategy(DDPStrategy): + """ + Defines a strategy that uses multiple XPU devices with + distributed data parallelism. + """ + + strategy_name = "ddp_with_pvc" + + def __init__( + self, + parallel_devices: List[torch.device] | None = None, + cluster_environment: ClusterEnvironment | None = None, + checkpoint_io: CheckpointIO | None = None, + precision_plugin: PrecisionPlugin | None = None, + ddp_comm_state: object | None = None, + ddp_comm_hook: Callable[..., Any] | None = None, + ddp_comm_wrapper: Callable[..., Any] | None = None, + model_averaging_period: int | None = None, + process_group_backend: str | None = "ccl", + timeout: timedelta | None = default_pg_timeout, + **kwargs: Any, + ) -> None: + accelerator = XPUAccelerator() + if cluster_environment is None: + cluster_environment = MPIEnvironment() + super().__init__( + accelerator, + parallel_devices, + cluster_environment, + checkpoint_io, + precision_plugin, + ddp_comm_state, + ddp_comm_hook, + ddp_comm_wrapper, + model_averaging_period, + process_group_backend, + timeout, + **kwargs, + ) + + @classmethod + def register_strategies(cls, strategy_registry) -> None: + strategy_registry.register( + cls.strategy_name, + cls, + description=f"{cls.__class__.__name__} - uses distributed data parallelism" + " to divide data across multiple XPU tiles.", + ) + + class XPUBF16Plugin(NativeMixedPrecisionPlugin): + def __init__(self): + super().__init__(torch.bfloat16, "xpu") + + def auto_cast_context_manager(self): + """ + Overrides the default behavior, which relies on `torch.amp` where only + CPU and CUDA backends are supported. This uses the `xpu.amp` interface + explicitly, as done in the IPEX documentation. + """ + return torch.xpu.amp.autocast( + self.device, enabled=True, dtype=torch.bfloat16 + ) From 5952ca75f0a225c23eb5a709f5ea7aa364f06760 Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 14:43:41 -0700 Subject: [PATCH 02/16] feat: ensuring intel libraries get loaded Signed-off-by: Lee, Kin Long Kelvin --- matsciml/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/matsciml/__init__.py b/matsciml/__init__.py index 5f6d2f63..5eb6edc5 100644 --- a/matsciml/__init__.py +++ b/matsciml/__init__.py @@ -1,3 +1,11 @@ from __future__ import annotations __version__ = "1.1.0" + +# determine if intel libraries are available +from matsciml.common.packages import package_registry + +if package_registry["ipex"]: + import intel_extension_for_pytorch # noqa: F401 +if package_registry["ccl"]: + import oneccl_bindings_for_pytorch # noqa: F401 From 48ed9da60e53f46dcf48f3d11bb7b49ac6b5872a Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 14:44:45 -0700 Subject: [PATCH 03/16] refactor: adding ipex import again just in case Signed-off-by: Lee, Kin Long Kelvin --- matsciml/lightning/xpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py index 97a2c383..39b69869 100644 --- a/matsciml/lightning/xpu.py +++ b/matsciml/lightning/xpu.py @@ -20,7 +20,7 @@ from torch import distributed as dist if package_registry["ipex"]: - import intel_extension_for_pytorch as ipex + import intel_extension_for_pytorch as ipex # noqa: F401 __all__ = ["XPUAccelerator", "SingleXPUStrategy", "DDPXPUStrategy"] From 480df2c276f64a44aa275046f6ff45a3f3d38b11 Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 14:45:58 -0700 Subject: [PATCH 04/16] script: adding example xpu script Signed-off-by: Lee, Kin Long Kelvin --- examples/devices/xpu_example.py | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 examples/devices/xpu_example.py diff --git a/examples/devices/xpu_example.py b/examples/devices/xpu_example.py new file mode 100644 index 00000000..23ca1671 --- /dev/null +++ b/examples/devices/xpu_example.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import pytorch_lightning as pl + +from matsciml.datasets.transforms import PointCloudToGraphTransform +from matsciml.lightning.data_utils import MatSciMLDataModule +from matsciml.models.base import ScalarRegressionTask +from matsciml.models.pyg import EGNN + +""" +This example script runs through a fast development run of the IS2RE devset +in combination with a PyG implementation of EGNN. +""" + +# construct IS2RE relaxed energy regression with PyG implementation of E(n)-GNN +task = ScalarRegressionTask( + encoder_class=EGNN, + encoder_kwargs={"hidden_dim": 128, "output_dim": 64}, + task_keys=["energy_relaxed"], +) +# matsciml devset for OCP are serialized with DGL - this transform goes between the two frameworks +dm = MatSciMLDataModule.from_devset( + "IS2REDataset", + dset_kwargs={ + "transforms": [ + PointCloudToGraphTransform( + "pyg", + cutoff_dist=20.0, + node_keys=["pos", "atomic_numbers"], + ), + ], + }, +) + +# run a quick training loop on a single XPU device +trainer = pl.Trainer(fast_dev_run=10, strategy="single_pvc") +trainer.fit(task, datamodule=dm) From 47ad0efcf52031144eedee30644a25f0407d2041 Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 16:08:29 -0700 Subject: [PATCH 05/16] feat: adding xpu to lightning init Signed-off-by: Lee, Kin Long Kelvin --- matsciml/lightning/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/matsciml/lightning/__init__.py b/matsciml/lightning/__init__.py index 6a5482cb..819c61f8 100644 --- a/matsciml/lightning/__init__.py +++ b/matsciml/lightning/__init__.py @@ -4,5 +4,6 @@ from matsciml.lightning.ddp import * from matsciml.lightning.data_utils import * +from matsciml.lightning.xpu import * __all__ = ["MatSciMLDataModule", "MultiDataModule"] From d634721f17f76fa78440052ea8f4dc681166626d Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 16:14:49 -0700 Subject: [PATCH 06/16] fix: correcting reference from lightning_lite Signed-off-by: Lee, Kin Long Kelvin --- matsciml/lightning/xpu.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py index 39b69869..584194b9 100644 --- a/matsciml/lightning/xpu.py +++ b/matsciml/lightning/xpu.py @@ -4,8 +4,8 @@ from datetime import timedelta from typing import Callable, Union, List, Dict, Any -from lightning_lite.plugins import CheckpointIO, ClusterEnvironment -from lightning_lite.plugins.collectives.torch_collective import default_pg_timeout +from pytorch_lightning.plugins import CheckpointIO, ClusterEnvironment +from pytorch_lightning.plugins.collectives.torch_collective import default_pg_timeout from pytorch_lightning.plugins.precision import ( PrecisionPlugin, NativeMixedPrecisionPlugin, From eecddfc81189b7a389e3e00b9c40fbc6554b9390 Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 16:17:28 -0700 Subject: [PATCH 07/16] fix: implementing default timeout instead of resolving import Signed-off-by: Lee, Kin Long Kelvin --- matsciml/lightning/xpu.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py index 584194b9..f561d32b 100644 --- a/matsciml/lightning/xpu.py +++ b/matsciml/lightning/xpu.py @@ -5,7 +5,6 @@ from typing import Callable, Union, List, Dict, Any from pytorch_lightning.plugins import CheckpointIO, ClusterEnvironment -from pytorch_lightning.plugins.collectives.torch_collective import default_pg_timeout from pytorch_lightning.plugins.precision import ( PrecisionPlugin, NativeMixedPrecisionPlugin, @@ -19,6 +18,8 @@ import torch from torch import distributed as dist +default_pg_timeout = timedelta(seconds=1800) + if package_registry["ipex"]: import intel_extension_for_pytorch as ipex # noqa: F401 From 56eae53c3cea2ab84c49e98ef47f466f6d95f434 Mon Sep 17 00:00:00 2001 From: "Lee, Kin Long Kelvin" Date: Thu, 14 Mar 2024 16:21:21 -0700 Subject: [PATCH 08/16] refactor: removing potentially unneeded precision plugin Signed-off-by: Lee, Kin Long Kelvin --- matsciml/lightning/xpu.py | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py index f561d32b..5162a9a8 100644 --- a/matsciml/lightning/xpu.py +++ b/matsciml/lightning/xpu.py @@ -5,10 +5,7 @@ from typing import Callable, Union, List, Dict, Any from pytorch_lightning.plugins import CheckpointIO, ClusterEnvironment -from pytorch_lightning.plugins.precision import ( - PrecisionPlugin, - NativeMixedPrecisionPlugin, -) +from pytorch_lightning.plugins.precision import Precision from matsciml.common.packages import package_registry from matsciml.lightning.ddp import MPIEnvironment @@ -175,7 +172,7 @@ def __init__( parallel_devices: List[torch.device] | None = None, cluster_environment: ClusterEnvironment | None = None, checkpoint_io: CheckpointIO | None = None, - precision_plugin: PrecisionPlugin | None = None, + precision_plugin: Precision | None = None, ddp_comm_state: object | None = None, ddp_comm_hook: Callable[..., Any] | None = None, ddp_comm_wrapper: Callable[..., Any] | None = None, @@ -210,17 +207,3 @@ def register_strategies(cls, strategy_registry) -> None: description=f"{cls.__class__.__name__} - uses distributed data parallelism" " to divide data across multiple XPU tiles.", ) - - class XPUBF16Plugin(NativeMixedPrecisionPlugin): - def __init__(self): - super().__init__(torch.bfloat16, "xpu") - - def auto_cast_context_manager(self): - """ - Overrides the default behavior, which relies on `torch.amp` where only - CPU and CUDA backends are supported. This uses the `xpu.amp` interface - explicitly, as done in the IPEX documentation. - """ - return torch.xpu.amp.autocast( - self.device, enabled=True, dtype=torch.bfloat16 - ) From 9881ee57c644f11cd3a1c11e3158c354e7350399 Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 17:13:24 -0700 Subject: [PATCH 09/16] feat: adding xpu strategies to registry Signed-off-by: Kin Long Kelvin Lee --- matsciml/lightning/xpu.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py index 5162a9a8..cc09419c 100644 --- a/matsciml/lightning/xpu.py +++ b/matsciml/lightning/xpu.py @@ -10,7 +10,7 @@ from matsciml.common.packages import package_registry from matsciml.lightning.ddp import MPIEnvironment from pytorch_lightning.accelerators import Accelerator, AcceleratorRegistry -from pytorch_lightning.strategies import SingleDeviceStrategy +from pytorch_lightning.strategies import SingleDeviceStrategy, StrategyRegistry from pytorch_lightning.strategies.ddp import DDPStrategy import torch from torch import distributed as dist @@ -165,7 +165,7 @@ class DDPXPUStrategy(DDPStrategy): distributed data parallelism. """ - strategy_name = "ddp_with_pvc" + strategy_name = "ddp_with_xpu" def __init__( self, @@ -207,3 +207,14 @@ def register_strategies(cls, strategy_registry) -> None: description=f"{cls.__class__.__name__} - uses distributed data parallelism" " to divide data across multiple XPU tiles.", ) + + StrategyRegistry.register( + "single_xpu", + SingleXPUStrategy, + description="Strategy utilizing a single Intel GPU device or tile.", + ) + StrategyRegistry.register( + "ddp_with_xpu", + DDPXPUStrategy, + description="Distributed data parallel strategy using multiple Intel GPU devices or tiles.", + ) From 15b5c0a6538ccb11b8a37b58d1cf412d6d3d0084 Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 17:15:32 -0700 Subject: [PATCH 10/16] fix: modernizing ddp code for up to date lightning Signed-off-by: Kin Long Kelvin Lee --- matsciml/lightning/ddp.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/matsciml/lightning/ddp.py b/matsciml/lightning/ddp.py index 31a54bc3..51de3cf5 100644 --- a/matsciml/lightning/ddp.py +++ b/matsciml/lightning/ddp.py @@ -7,17 +7,16 @@ from typing import Any, Callable import torch -from lightning_fabric.plugins.collectives.torch_collective import default_pg_timeout import pytorch_lightning as pl from pytorch_lightning.plugins import CheckpointIO from pytorch_lightning.plugins.environments import LightningEnvironment -from pytorch_lightning.plugins.precision import PrecisionPlugin +from pytorch_lightning.plugins.precision import Precision from pytorch_lightning.strategies import StrategyRegistry from pytorch_lightning.strategies.ddp import DDPStrategy __all__ = ["MPIEnvironment", "MPIDDPStrategy"] -# majority of these imports are just for type hinting! +default_pg_timeout = timedelta(seconds=1800) class MPIEnvironment(LightningEnvironment): @@ -69,7 +68,7 @@ def __init__( accelerator: pl.accelerators.Accelerator | None = None, parallel_devices: list[torch.device] | None = None, checkpoint_io: CheckpointIO | None = None, - precision_plugin: PrecisionPlugin | None = None, + precision_plugin: Precision | None = None, ddp_comm_state: object | None = None, ddp_comm_hook: Callable | None = None, ddp_comm_wrapper: Callable | None = None, From c4ae02fbc09c94ca791401ced84bd4df1abcb775 Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 17:53:15 -0700 Subject: [PATCH 11/16] fix: making sure accelerator and strategy are used properly Signed-off-by: Kin Long Kelvin Lee --- examples/devices/xpu_example.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/devices/xpu_example.py b/examples/devices/xpu_example.py index 23ca1671..616f9433 100644 --- a/examples/devices/xpu_example.py +++ b/examples/devices/xpu_example.py @@ -4,6 +4,9 @@ from matsciml.datasets.transforms import PointCloudToGraphTransform from matsciml.lightning.data_utils import MatSciMLDataModule + +# this is needed to register strategy and accelerator +from matsciml.lightning import xpu # noqa: F401 from matsciml.models.base import ScalarRegressionTask from matsciml.models.pyg import EGNN @@ -33,5 +36,5 @@ ) # run a quick training loop on a single XPU device -trainer = pl.Trainer(fast_dev_run=10, strategy="single_pvc") +trainer = pl.Trainer(fast_dev_run=10, strategy="single_pvc", accelerator="xpu") trainer.fit(task, datamodule=dm) From 30409ec90af244fff51b3d8f86a7009ecdb9875f Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 17:56:32 -0700 Subject: [PATCH 12/16] script: adding bf16 configuration to example Signed-off-by: Kin Long Kelvin Lee --- examples/devices/xpu_example.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/devices/xpu_example.py b/examples/devices/xpu_example.py index 616f9433..3fc15912 100644 --- a/examples/devices/xpu_example.py +++ b/examples/devices/xpu_example.py @@ -35,6 +35,8 @@ }, ) -# run a quick training loop on a single XPU device -trainer = pl.Trainer(fast_dev_run=10, strategy="single_pvc", accelerator="xpu") +# run a quick training loop on a single XPU device with BF16 automatic mixed precision +trainer = pl.Trainer( + fast_dev_run=10, strategy="single_pvc", accelerator="xpu", precision="bf16-mixed" +) trainer.fit(task, datamodule=dm) From 8c85bbf3218e666bd1626a8dec619e0172a21caf Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 18:02:45 -0700 Subject: [PATCH 13/16] fix: making ipex import at the beginning captured by exception Signed-off-by: Kin Long Kelvin Lee --- matsciml/__init__.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/matsciml/__init__.py b/matsciml/__init__.py index 5eb6edc5..59500844 100644 --- a/matsciml/__init__.py +++ b/matsciml/__init__.py @@ -1,11 +1,19 @@ from __future__ import annotations -__version__ = "1.1.0" +from logging import getLogger # determine if intel libraries are available from matsciml.common.packages import package_registry +__version__ = "1.1.0" + +logger = getLogger(__file__) + + if package_registry["ipex"]: - import intel_extension_for_pytorch # noqa: F401 + try: + import intel_extension_for_pytorch # noqa: F401 + except ImportError as e: + logger.warning(f"Unable to load IPEX because of {e} - XPU may not function.") if package_registry["ccl"]: import oneccl_bindings_for_pytorch # noqa: F401 From 08d515f85c06f08af6b5c6f2c1ff83744d5cc496 Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 18:07:19 -0700 Subject: [PATCH 14/16] fix: correcting github action to reinstall matsciml Signed-off-by: Kin Long Kelvin Lee --- .github/workflows/run_pytest_endtoend.yml | 4 ++++ .github/workflows/run_pytest_lightning.yml | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/.github/workflows/run_pytest_endtoend.yml b/.github/workflows/run_pytest_endtoend.yml index ac4be747..d851fcb3 100644 --- a/.github/workflows/run_pytest_endtoend.yml +++ b/.github/workflows/run_pytest_endtoend.yml @@ -25,6 +25,10 @@ jobs: cache-environment: true post-cleanup: 'all' generate-run-shell: true + - name: Install current version of matsciml + run: | + pip install . + shell: micromamba-shell {0} - name: Install PyTest run: | pip install pytest pytest-dependency diff --git a/.github/workflows/run_pytest_lightning.yml b/.github/workflows/run_pytest_lightning.yml index fc333f0b..bb5d6dc0 100644 --- a/.github/workflows/run_pytest_lightning.yml +++ b/.github/workflows/run_pytest_lightning.yml @@ -19,6 +19,10 @@ jobs: cache-environment: true post-cleanup: 'all' generate-run-shell: true + - name: Install current matsciml + run: | + pip install . + shell: micromamba-shell {0} - name: Install PyTest run: | pip install pytest pytest-dependency From dea11edf0e180775bee0a5fb329076521c3a0ac2 Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 18:10:42 -0700 Subject: [PATCH 15/16] fix: added another exception handling for ipex load errors Signed-off-by: Kin Long Kelvin Lee --- matsciml/lightning/xpu.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/matsciml/lightning/xpu.py b/matsciml/lightning/xpu.py index cc09419c..53b94b32 100644 --- a/matsciml/lightning/xpu.py +++ b/matsciml/lightning/xpu.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: MIT License from __future__ import annotations from datetime import timedelta +from logging import getLogger from typing import Callable, Union, List, Dict, Any from pytorch_lightning.plugins import CheckpointIO, ClusterEnvironment @@ -17,8 +18,13 @@ default_pg_timeout = timedelta(seconds=1800) +logger = getLogger(__file__) + if package_registry["ipex"]: - import intel_extension_for_pytorch as ipex # noqa: F401 + try: + import intel_extension_for_pytorch as ipex # noqa: F401 + except ImportError as e: + logger.warning(f"Unable to import IPEX due to {e} - XPU may not function.") __all__ = ["XPUAccelerator", "SingleXPUStrategy", "DDPXPUStrategy"] From fc4404094067d7c10563e3cec52f77e448b49d62 Mon Sep 17 00:00:00 2001 From: Kin Long Kelvin Lee Date: Thu, 14 Mar 2024 18:18:37 -0700 Subject: [PATCH 16/16] fix: correcting strategy string name in example script Signed-off-by: Kin Long Kelvin Lee --- examples/devices/xpu_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/devices/xpu_example.py b/examples/devices/xpu_example.py index 3fc15912..83887cfd 100644 --- a/examples/devices/xpu_example.py +++ b/examples/devices/xpu_example.py @@ -37,6 +37,6 @@ # run a quick training loop on a single XPU device with BF16 automatic mixed precision trainer = pl.Trainer( - fast_dev_run=10, strategy="single_pvc", accelerator="xpu", precision="bf16-mixed" + fast_dev_run=10, strategy="single_xpu", accelerator="xpu", precision="bf16-mixed" ) trainer.fit(task, datamodule=dm)