From 55c9c72d8ad6709dc8591019dee44d48f0897ac8 Mon Sep 17 00:00:00 2001
From: rlratzel <rratzel@nvidia.com>
Date: Wed, 16 Oct 2024 16:19:45 -0500
Subject: [PATCH] Temporarily adds new dataset for benchmarking in the 100k
 node 1M edge range.

---
 .../nx-cugraph/pytest-based/bench_algos.py    | 50 ++++++++++++++-----
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
index f88d93c3f17..9e80332351d 100644
--- a/benchmarks/nx-cugraph/pytest-based/bench_algos.py
+++ b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -37,6 +37,40 @@
 iterations = 1
 warmup_rounds = 1
 
+# FIXME: Add this to cugraph.datasets.  This is done here so these benchmarks
+# can be run without requiring an updated cugraph install.  This temporarily
+# adds a dataset based on an Amazon product co-purchasing network.
+amazon0302_metadata = """
+name: amazon0302
+description:
+  Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003.
+author: J. Leskovec, L. Adamic and B. Adamic
+refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007.
+delim: "\t"
+header: 3
+col_names:
+  - FromNodeId
+  - ToNodeId
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 1234877
+number_of_nodes: 262111
+url: https://snap.stanford.edu/data/amazon0302.txt.gz
+"""
+amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml"
+if not amazon0302_metadata_file_name.exists():
+    amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True)
+    with open(amazon0302_metadata_file_name, "w") as f:
+        f.write(amazon0302_metadata)
+
+amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name)
+amazon0302_dataset.metadata["file_type"] = ".gz"
+
 dataset_param_values = [
     # name: karate, nodes: 34, edges: 156
     pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]),
@@ -46,6 +80,8 @@
     pytest.param(
         datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed]
     ),
+    # name: amazon0302, nodes: 262111, edges: 1234877
+    pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]),
     # name: cit-Patents, nodes: 3774768, edges: 16518948
     pytest.param(
         datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]
@@ -113,19 +149,7 @@ def nx_graph_from_dataset(dataset_obj):
     """
     create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph
     names = dataset_obj.metadata["col_names"]
-    dtypes = dataset_obj.metadata["col_types"]
-    if isinstance(dataset_obj.metadata["header"], int):
-        header = dataset_obj.metadata["header"]
-    else:
-        header = None
-
-    pandas_edgelist = pd.read_csv(
-        dataset_obj.get_path(),
-        delimiter=dataset_obj.metadata["delim"],
-        names=names,
-        dtype=dict(zip(names, dtypes)),
-        header=header,
-    )
+    pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas")
     G = nx.from_pandas_edgelist(
         pandas_edgelist, source=names[0], target=names[1], create_using=create_using
     )