Commit
[GraphBolt] fix preprocess issue for single ntype/etype graph (#7011)
Rhett-Ying authored Jan 25, 2024
1 parent 0bfe34d commit 523bbb4
Showing 2 changed files with 110 additions and 15 deletions.
45 changes: 30 additions & 15 deletions python/dgl/graphbolt/impl/ondisk_dataset.py
@@ -118,7 +118,18 @@ def preprocess_ondisk_dataset(
    # 2. Load the edge data and create a DGLGraph.
    if "graph" not in input_config:
        raise RuntimeError("Invalid config: does not contain graph field.")
    is_homogeneous = "type" not in input_config["graph"]["nodes"][0]
    # For any graph whose node/edge types are specified, we construct the
    # DGLGraph with `dgl.heterograph()` even if there is only one node/edge
    # type. This is because we want to keep the node/edge type names in the
    # graph, so the logic for deciding whether the graph is homogeneous
    # differs from `DGLGraph.is_homogeneous()`. Otherwise, we construct the
    # DGLGraph with `dgl.graph()`.
    is_homogeneous = (
        len(input_config["graph"]["nodes"]) == 1
        and len(input_config["graph"]["edges"]) == 1
        and "type" not in input_config["graph"]["nodes"][0]
        and "type" not in input_config["graph"]["edges"][0]
    )
    if is_homogeneous:
        # Homogeneous graph.
        num_nodes = input_config["graph"]["nodes"][0]["num"]
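
Editorial aside, not part of the diff: a minimal sketch of what the stricter check implies, assuming a hypothetical `graph_cfg` dict shaped like `input_config["graph"]`. A config counts as homogeneous only when it declares exactly one untyped node set and one untyped edge set; anything that names a type is built with `dgl.heterograph()` so the type names are kept.

# Editorial sketch only; mirrors the logic of the new `is_homogeneous` check.
def looks_homogeneous(graph_cfg):
    return (
        len(graph_cfg["nodes"]) == 1
        and len(graph_cfg["edges"]) == 1
        and "type" not in graph_cfg["nodes"][0]
        and "type" not in graph_cfg["edges"][0]
    )

untyped_cfg = {
    "nodes": [{"num": 10}],
    "edges": [{"format": "csv", "path": "edge.csv"}],
}
typed_cfg = {
    "nodes": [{"num": 10, "type": "author"}],
    "edges": [{"type": "author:collab:author", "format": "csv", "path": "edge.csv"}],
}
assert looks_homogeneous(untyped_cfg)      # would be built with dgl.graph()
assert not looks_homogeneous(typed_cfg)    # would be built with dgl.heterograph()
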
@@ -178,20 +189,24 @@ def preprocess_ondisk_dataset(
    if not is_homogeneous:
        # For a heterogeneous graph, a node/edge feature must cover all
        # node/edge types.
        for feat_name, feat_data in g.ndata.items():
            existing_types = set(feat_data.keys())
            assert existing_types == set(g.ntypes), (
                f"Node feature {feat_name} does not cover all node types."
                + f"Existing types: {existing_types}."
                + f"Expected types: {g.ntypes}."
            )
        for feat_name, feat_data in g.edata.items():
            existing_types = set(feat_data.keys())
            assert existing_types == set(g.canonical_etypes), (
                f"Edge feature {feat_name} does not cover all edge types."
                + f"Existing types: {existing_types}."
                + f"Expected types: {g.etypes}."
            )
        ntypes = g.ntypes
        assert all(
            set(g.nodes[ntypes[0]].data.keys())
            == set(g.nodes[ntype].data.keys())
            for ntype in ntypes
        ), (
            "Node feature does not cover all node types: "
            + f"{set(g.nodes[ntype].data.keys() for ntype in ntypes)}."
        )
        etypes = g.canonical_etypes
        assert all(
            set(g.edges[etypes[0]].data.keys())
            == set(g.edges[etype].data.keys())
            for etype in etypes
        ), (
            "Edge feature does not cover all edge types: "
            + f"{set(g.edges[etype].data.keys() for etype in etypes)}."
        )

    # 4. Convert the DGLGraph to a FusedCSCSamplingGraph.
    fused_csc_sampling_graph = from_dglgraph(
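
Editorial aside, not part of the diff: the rewritten coverage check compares feature keys across types through the per-type accessors `g.nodes[ntype].data` / `g.edges[etype].data`, which behave the same whether the graph has one type or many. The toy heterograph below (made up for illustration) has a node feature that covers only one of its two node types, so a check of this form reports failure.

# Editorial sketch only: "feat" covers "user" nodes but not "item" nodes.
import dgl
import torch

g = dgl.heterograph({
    ("user", "follows", "user"): (torch.tensor([0, 1]), torch.tensor([1, 2])),
    ("user", "writes", "item"): (torch.tensor([0, 2]), torch.tensor([0, 1])),
})
g.nodes["user"].data["feat"] = torch.rand(g.num_nodes("user"), 4)

covered = all(
    set(g.nodes[g.ntypes[0]].data.keys()) == set(g.nodes[ntype].data.keys())
    for ntype in g.ntypes
)
print(covered)  # False: "item" nodes carry no "feat", so the assertion would fire.
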
80 changes: 80 additions & 0 deletions tests/python/pytorch/graphbolt/impl/test_ondisk_dataset.py
@@ -2742,3 +2742,83 @@ def test_OnDiskDataset_load_tasks_selectively():
        dataset = gb.OnDiskDataset(test_dir).load(tasks=2)

        dataset = None


def test_OnDiskDataset_preprocess_graph_with_single_type():
    """Test for graph with a single node/edge type."""
    with tempfile.TemporaryDirectory() as test_dir:
        # All metadata fields are specified.
        dataset_name = "graphbolt_test"
        num_nodes = 4000
        num_edges = 20000

        # Generate random edges.
        nodes = np.repeat(np.arange(num_nodes), 5)
        neighbors = np.random.randint(0, num_nodes, size=(num_edges))
        edges = np.stack([nodes, neighbors], axis=1)
        # Write into edges/edge.csv.
        os.makedirs(os.path.join(test_dir, "edges/"), exist_ok=True)
        edges = pd.DataFrame(edges, columns=["src", "dst"])
        edges.to_csv(
            os.path.join(test_dir, "edges/edge.csv"),
            index=False,
            header=False,
        )

        # Generate random graph edge-feats.
        edge_feats = np.random.rand(num_edges, 5)
        os.makedirs(os.path.join(test_dir, "data/"), exist_ok=True)
        np.save(os.path.join(test_dir, "data/edge-feat.npy"), edge_feats)

        # Generate random node-feats.
        node_feats = np.random.rand(num_nodes, 10)
        np.save(os.path.join(test_dir, "data/node-feat.npy"), node_feats)

        yaml_content = f"""
            dataset_name: {dataset_name}
            graph: # graph structure and required attributes.
              nodes:
                - num: {num_nodes}
                  type: author
              edges:
                - type: author:collab:author
                  format: csv
                  path: edges/edge.csv
              feature_data:
                - domain: edge
                  type: author:collab:author
                  name: feat
                  format: numpy
                  path: data/edge-feat.npy
                - domain: node
                  type: author
                  name: feat
                  format: numpy
                  path: data/node-feat.npy
        """
        yaml_file = os.path.join(test_dir, "metadata.yaml")
        with open(yaml_file, "w") as f:
            f.write(yaml_content)

        dataset = gb.OnDiskDataset(test_dir).load()
        assert dataset.dataset_name == dataset_name

        graph = dataset.graph
        assert isinstance(graph, gb.FusedCSCSamplingGraph)
        assert graph.total_num_nodes == num_nodes
        assert graph.total_num_edges == num_edges
        assert (
            graph.node_attributes is not None
            and "feat" in graph.node_attributes
        )
        assert (
            graph.edge_attributes is not None
            and "feat" in graph.edge_attributes
        )
        assert torch.equal(graph.node_type_offset, torch.tensor([0, num_nodes]))
        assert torch.equal(
            graph.type_per_edge,
            torch.zeros(num_edges),
        )
        assert graph.edge_type_to_id == {"author:collab:author": 0}
        assert graph.node_type_to_id == {"author": 0}
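
Editorial aside, not part of the commit: because the single node/edge type in this config is named, preprocessing now routes the graph through `dgl.heterograph()` instead of `dgl.graph()`, so the type names survive into the converted graph; that is what the `node_type_to_id` / `edge_type_to_id` assertions above rely on. The sketch below approximates the intermediate DGLGraph for the config in this test.

# Editorial sketch only: roughly the intermediate DGLGraph built for the
# single-typed config above, before conversion to FusedCSCSamplingGraph.
import dgl
import numpy as np
import torch

num_nodes, num_edges = 4000, 20000
src = torch.from_numpy(np.repeat(np.arange(num_nodes), 5))
dst = torch.randint(0, num_nodes, (num_edges,))
g = dgl.heterograph(
    {("author", "collab", "author"): (src, dst)},
    num_nodes_dict={"author": num_nodes},
)
assert g.ntypes == ["author"]  # the type name is preserved
assert g.canonical_etypes == [("author", "collab", "author")]
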
