refactor DTensor output to ShardingEnv from fused params
Summary:
We refactor how users enable DTensor output in the state dict, moving the flag from fused parameters to ShardingEnv. Fused params are meant to eventually be passed on to TBE, which does not align with carrying the `output_dtensor` flag there: we would have to pop the flag out of the fused params dict before it is passed on to TBE, which is poor design.

Sourcing the flag from ShardingEnv aligns more closely with how it is used and fits the design of ShardingEnv, in the sense that ShardingEnv informs TorchRec of the environment it is running in and its parameters (in the 2D case, the device mesh, global pg, sharding pg, etc.).

From the user's perspective, the flag is now supplied at ShardingEnv construction, which is a simpler change than adding it to fused params. From the perspective of trainers, changing fused params can cause many breaking changes.

This also allows us to enable DTensor output by default in 2D parallel cases, removing a potential source of user error.
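
As a rough sketch of the user-facing change (illustrative only, not part of this diff; the sharder construction shown here is just one common way fused params are supplied, and the world size / rank values are placeholders):

from torchrec.distributed.embeddingbag import EmbeddingBagCollectionSharder
from torchrec.distributed.types import ShardingEnv

# Before: opt in through fused params, which then had to be popped before reaching TBE.
sharder = EmbeddingBagCollectionSharder(fused_params={"output_dtensor": True})

# After: opt in when constructing the ShardingEnv (placeholder values; typically
# dist.get_world_size(), dist.get_rank(), and a real process group).
env = ShardingEnv(world_size=2, rank=0, output_dtensor=True)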

Differential Revision: D67307210
iamzainhuda authored and facebook-github-bot committed Dec 17, 2024
1 parent ac4d360 commit 1bdf7d1
Showing 7 changed files with 9 additions and 17 deletions.
4 changes: 1 addition & 3 deletions torchrec/distributed/embeddingbag.py
@@ -610,9 +610,7 @@ def __init__(
         )
         self._env = env
         # output parameters as DTensor in state dict
-        self._output_dtensor: bool = (
-            fused_params.get("output_dtensor", False) if fused_params else False
-        )
+        self._output_dtensor: bool = env.output_dtensor

         sharding_type_to_sharding_infos = create_sharding_infos_by_sharding(
             module,
4 changes: 1 addition & 3 deletions torchrec/distributed/sharding/cw_sharding.py
@@ -169,7 +169,7 @@ def _shard(
         )

         dtensor_metadata = None
-        if info.fused_params.get("output_dtensor", False): # pyre-ignore[16]
+        if self._env.output_dtensor:
             dtensor_metadata = DTensorMetadata(
                 mesh=self._env.device_mesh,
                 placements=(
@@ -186,8 +186,6 @@ def _shard(
                 ),
                 stride=info.param.stride(),
             )
-            # to not pass onto TBE
-            info.fused_params.pop("output_dtensor", None) # pyre-ignore[16]

         # pyre-fixme [6]
         for i, rank in enumerate(info.param_sharding.ranks):
5 changes: 1 addition & 4 deletions torchrec/distributed/sharding/grid_sharding.py
@@ -232,7 +232,7 @@ def _shard(
         )

         dtensor_metadata = None
-        if info.fused_params.get("output_dtensor", False): # pyre-ignore[16]
+        if self._env.output_dtensor:
             placements = (
                 (Replicate(), Shard(1)) if self._is_2D_parallel else (Shard(1),)
             )
@@ -246,9 +246,6 @@ def _shard(
                 stride=info.param.stride(),
             )

-            # to not pass onto TBE
-            info.fused_params.pop("output_dtensor", None) # pyre-ignore[16]
-
         # Expectation is planner CW shards across a node, so each CW shard will have local_size number of row shards
         # pyre-fixme [6]
         for i, rank in enumerate(info.param_sharding.ranks):
4 changes: 1 addition & 3 deletions torchrec/distributed/sharding/rw_sharding.py
@@ -179,7 +179,7 @@ def _shard(
         )

         dtensor_metadata = None
-        if info.fused_params.get("output_dtensor", False): # pyre-ignore[16]
+        if self._env.output_dtensor:
             placements = (
                 (Replicate(), Shard(0)) if self._is_2D_parallel else (Shard(0),)
             )
@@ -197,8 +197,6 @@ def _shard(
                 ),
                 stride=info.param.stride(),
             )
-            # to not pass onto TBE
-            info.fused_params.pop("output_dtensor", None) # pyre-ignore[16]

         for rank in range(self._world_size):
             tables_per_rank[rank].append(
4 changes: 1 addition & 3 deletions torchrec/distributed/sharding/twrw_sharding.py
@@ -164,7 +164,7 @@ def _shard(
         )

         dtensor_metadata = None
-        if info.fused_params.get("output_dtensor", False): # pyre-ignore[16]
+        if self._env.output_dtensor:
             placements = (Shard(0),)
             dtensor_metadata = DTensorMetadata(
                 mesh=self._env.device_mesh,
@@ -175,8 +175,6 @@ def _shard(
                 ),
                 stride=info.param.stride(),
             )
-            # to not pass onto TBE
-            info.fused_params.pop("output_dtensor", None) # pyre-ignore[16]

         for rank in range(
             table_node * local_size,
2 changes: 1 addition & 1 deletion torchrec/distributed/tests/test_2d_sharding.py
@@ -466,8 +466,8 @@ def test_sharding_twrw_2D(

         self._test_sharding(
             world_size=self.WORLD_SIZE,
-            local_size=self.WORLD_SIZE_2D // 2,
             world_size_2D=self.WORLD_SIZE_2D,
+            node_group_size=self.WORLD_SIZE // 4,
             sharders=[
                 cast(
                     ModuleSharder[nn.Module],
3 changes: 3 additions & 0 deletions torchrec/distributed/types.py
@@ -813,6 +813,7 @@ def __init__(
         world_size: int,
         rank: int,
         pg: Optional[dist.ProcessGroup] = None,
+        output_dtensor: bool = False,
     ) -> None:
         self.world_size = world_size
         self.rank = rank
@@ -825,6 +826,7 @@ def __init__(
             if pg
             else None
         )
+        self.output_dtensor: bool = output_dtensor

     @classmethod
     def from_process_group(cls, pg: dist.ProcessGroup) -> "ShardingEnv":
@@ -886,6 +888,7 @@ def __init__(
         self.sharding_pg: dist.ProcessGroup = sharding_pg
         self.device_mesh: DeviceMesh = device_mesh
         self.node_group_size: Optional[int] = node_group_size
+        self.output_dtensor: bool = True

     def num_sharding_groups(self) -> int:
         """
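
For reference, a small hedged sketch of the resulting contract from the types.py change above (placeholder world size / rank values, not a test from this diff): every ShardingEnv now carries an output_dtensor attribute that the sharders read directly, and the 2D environment in the last hunk pins it to True.

from torchrec.distributed.types import ShardingEnv

# Standard env: output_dtensor defaults to False unless opted in at construction.
env = ShardingEnv(world_size=2, rank=0)  # placeholder values, no process group
assert env.output_dtensor is False

env_dt = ShardingEnv(world_size=2, rank=0, output_dtensor=True)
assert env_dt.output_dtensor is True
# The 2D environment in the last hunk above sets output_dtensor to True unconditionally,
# which is what enables DTensor output by default for 2D parallel.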
