add size and stride for empty shard DT (pytorch#2662)

Summary: Make an empty DT shard on a rank behave the same as an empty ST shard. For OT, our current DT approach broke transfer learning because they expect tensor.size() to return the global shape; to fix this, we amend the empty-shard DT init to include the global shape and stride. Differential Revision: D67727355
iamzainhuda · Jan 2, 2025 · eb8f8e3 · eb8f8e3
1 parent 455de88
commit eb8f8e3
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 1 deletion.
diff --git a/torchrec/distributed/embeddingbag.py b/torchrec/distributed/embeddingbag.py
@@ -73,6 +73,7 @@
     add_params_from_parameter_sharding,
     append_prefix,
     convert_to_fbgemm_types,
+    create_global_tensor_shape_stride_from_metadata,
     maybe_annotate_embedding_event,
     merge_fused_params,
     none_throws,
@@ -918,6 +919,9 @@ def _initialize_torch_state(self) -> None:  # noqa
                         )
                     )
                 else:
+                    shape, stride = create_global_tensor_shape_stride_from_metadata(
+                        none_throws(self.module_sharding_plan[table_name])
+                    )
                     # empty shard case
                     self._model_parallel_name_to_dtensor[table_name] = (
                         DTensor.from_local(
@@ -927,6 +931,8 @@ def _initialize_torch_state(self) -> None:  # noqa
                             ),
                             device_mesh=self._env.device_mesh,
                             run_check=False,
+                            shape=shape,
+                            stride=stride,
                         )
                     )
             else:

diff --git a/torchrec/distributed/utils.py b/torchrec/distributed/utils.py
@@ -15,7 +15,7 @@
 from collections import OrderedDict
 from contextlib import AbstractContextManager, nullcontext
 from dataclasses import asdict
-from typing import Any, Dict, List, Optional, Set, Type, TypeVar, Union
+from typing import Any, Dict, List, Optional, Set, Tuple, Type, TypeVar, Union
 
 import torch
 from fbgemm_gpu.split_embedding_configs import EmbOptimType
@@ -511,3 +511,36 @@ def interaction(self, *args, **kwargs) -> None:
             pdb.Pdb.interaction(self, *args, **kwargs)
         finally:
             sys.stdin = _stdin
+
+
+def create_global_tensor_shape_stride_from_metadata(
+    parameter_sharding: ParameterSharding,
+) -> Tuple[torch.Size, Tuple[int, ...]]:
+    """
+    Create a global tensor shape and stride from shard metadata.
+
+    Args:
+        parameter_sharding: its ``sharding_type`` and ``sharding_spec.shards`` are read.
+
+    Returns:
+        torch.Size: global tensor shape ([0, 0] for unhandled sharding types).
+        tuple: contiguous row-major stride ``(num_cols, 1)``; (0, 0) if unhandled.
+    """
+    size = torch.Size([0, 0])
+    stride = (0, 0)
+    if parameter_sharding.sharding_type == "column_wise":
+        shards = parameter_sharding.sharding_spec.shards  # pyre-ignore[16]
+        row_dim = shards[0].shard_sizes[0]
+        col_dim = sum(shard.shard_sizes[1] for shard in shards)  # widths add up
+        size = torch.Size([row_dim, col_dim])
+        stride = (col_dim, 1)
+    elif parameter_sharding.sharding_type in ("table_row_wise", "row_wise"):
+        shards = parameter_sharding.sharding_spec.shards
+        row_dim = sum(shard.shard_sizes[0] for shard in shards)  # row counts add up
+        col_dim = shards[0].shard_sizes[1]
+        size = torch.Size([row_dim, col_dim])
+        stride = (col_dim, 1)
+    elif parameter_sharding.sharding_type == "table_wise":
+        size = torch.Size(parameter_sharding.sharding_spec.shards[0].shard_sizes)
+        stride = (size[1], 1)
+    return size, stride