From 523bbb4cb18a876d03a7df9d9da734482d784f93 Mon Sep 17 00:00:00 2001
From: Rhett Ying <85214957+Rhett-Ying@users.noreply.github.com>
Date: Thu, 25 Jan 2024 16:03:25 +0800
Subject: [PATCH] [GraphBolt] fix preprocess issue for single ntype/etype graph
 (#7011)

---
 python/dgl/graphbolt/impl/ondisk_dataset.py   | 45 +++++++----
 .../graphbolt/impl/test_ondisk_dataset.py     | 80 +++++++++++++++++++
 2 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/python/dgl/graphbolt/impl/ondisk_dataset.py b/python/dgl/graphbolt/impl/ondisk_dataset.py
index 5da99fe74f23..e636e17e8f31 100644
--- a/python/dgl/graphbolt/impl/ondisk_dataset.py
+++ b/python/dgl/graphbolt/impl/ondisk_dataset.py
@@ -118,7 +118,18 @@ def preprocess_ondisk_dataset(
     # 2. Load the edge data and create a DGLGraph.
     if "graph" not in input_config:
         raise RuntimeError("Invalid config: does not contain graph field.")
-    is_homogeneous = "type" not in input_config["graph"]["nodes"][0]
+    # For any graph that node/edge types are specified, we construct DGLGraph
+    # with `dgl.heterograph()` even there's only one node/edge type. This is
+    # because we want to save the node/edge types in the graph. So the logic of
+    # checking whether the graph is homogeneous is different from the logic in
+    # `DGLGraph.is_homogeneous()`. Otherwise, we construct DGLGraph with
+    # `dgl.graph()`.
+    is_homogeneous = (
+        len(input_config["graph"]["nodes"]) == 1
+        and len(input_config["graph"]["edges"]) == 1
+        and "type" not in input_config["graph"]["nodes"][0]
+        and "type" not in input_config["graph"]["edges"][0]
+    )
     if is_homogeneous:
         # Homogeneous graph.
         num_nodes = input_config["graph"]["nodes"][0]["num"]
@@ -178,20 +189,24 @@ def preprocess_ondisk_dataset(
         if not is_homogeneous:
             # For heterogenous graph, a node/edge feature must cover all
             # node/edge types.
-            for feat_name, feat_data in g.ndata.items():
-                existing_types = set(feat_data.keys())
-                assert existing_types == set(g.ntypes), (
-                    f"Node feature {feat_name} does not cover all node types."
-                    + f"Existing types: {existing_types}."
-                    + f"Expected types: {g.ntypes}."
-                )
-            for feat_name, feat_data in g.edata.items():
-                existing_types = set(feat_data.keys())
-                assert existing_types == set(g.canonical_etypes), (
-                    f"Edge feature {feat_name} does not cover all edge types."
-                    + f"Existing types: {existing_types}."
-                    + f"Expected types: {g.etypes}."
-                )
+            ntypes = g.ntypes
+            assert all(
+                set(g.nodes[ntypes[0]].data.keys())
+                == set(g.nodes[ntype].data.keys())
+                for ntype in ntypes
+            ), (
+                "Node feature does not cover all node types: "
+                + f"{set(g.nodes[ntype].data.keys() for ntype in ntypes)}."
+            )
+            etypes = g.canonical_etypes
+            assert all(
+                set(g.edges[etypes[0]].data.keys())
+                == set(g.edges[etype].data.keys())
+                for etype in etypes
+            ), (
+                "Edge feature does not cover all edge types: "
+                + f"{set(g.edges[etype].data.keys() for etype in etypes)}."
+            )
 
     # 4. Convert the DGLGraph to a FusedCSCSamplingGraph.
     fused_csc_sampling_graph = from_dglgraph(
diff --git a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
index ef15453463e7..bf597593caf5 100644
--- a/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
+++ b/tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
@@ -2742,3 +2742,83 @@ def test_OnDiskDataset_load_tasks_selectively():
             dataset = gb.OnDiskDataset(test_dir).load(tasks=2)
 
         dataset = None
+
+
+def test_OnDiskDataset_preprocess_graph_with_single_type():
+    """Test for graph with single node/edge type."""
+    with tempfile.TemporaryDirectory() as test_dir:
+        # All metadata fields are specified.
+        dataset_name = "graphbolt_test"
+        num_nodes = 4000
+        num_edges = 20000
+
+        # Generate random edges.
+        nodes = np.repeat(np.arange(num_nodes), 5)
+        neighbors = np.random.randint(0, num_nodes, size=(num_edges))
+        edges = np.stack([nodes, neighbors], axis=1)
+        # Wrtie into edges/edge.csv
+        os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
+        edges = pd.DataFrame(edges, columns=["src", "dst"])
+        edges.to_csv(
+            os.path.join(test_dir, "edges/edge.csv"),
+            index=False,
+            header=False,
+        )
+
+        # Generate random graph edge-feats.
+        edge_feats = np.random.rand(num_edges, 5)
+        os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
+        np.save(os.path.join(test_dir, "data/edge-feat.npy"), edge_feats)
+
+        # Generate random node-feats.
+        node_feats = np.random.rand(num_nodes, 10)
+        np.save(os.path.join(test_dir, "data/node-feat.npy"), node_feats)
+
+        yaml_content = f"""
+            dataset_name: {dataset_name}
+            graph: # graph structure and required attributes.
+                nodes:
+                    - num: {num_nodes}
+                      type: author
+                edges:
+                    - type: author:collab:author
+                      format: csv
+                      path: edges/edge.csv
+                feature_data:
+                    - domain: edge
+                      type: author:collab:author
+                      name: feat
+                      format: numpy
+                      path: data/edge-feat.npy
+                    - domain: node
+                      type: author
+                      name: feat
+                      format: numpy
+                      path: data/node-feat.npy
+        """
+        yaml_file = os.path.join(test_dir, "metadata.yaml")
+        with open(yaml_file, "w") as f:
+            f.write(yaml_content)
+
+        dataset = gb.OnDiskDataset(test_dir).load()
+        assert dataset.dataset_name == dataset_name
+
+        graph = dataset.graph
+        assert isinstance(graph, gb.FusedCSCSamplingGraph)
+        assert graph.total_num_nodes == num_nodes
+        assert graph.total_num_edges == num_edges
+        assert (
+            graph.node_attributes is not None
+            and "feat" in graph.node_attributes
+        )
+        assert (
+            graph.edge_attributes is not None
+            and "feat" in graph.edge_attributes
+        )
+        assert torch.equal(graph.node_type_offset, torch.tensor([0, num_nodes]))
+        assert torch.equal(
+            graph.type_per_edge,
+            torch.zeros(num_edges),
+        )
+        assert graph.edge_type_to_id == {"author:collab:author": 0}
+        assert graph.node_type_to_id == {"author": 0}