Adds new dataset for benchmarking in the 100k node 1M edge range, adds additional k-values for BC benchmarks (#4726)

Adds new dataset for benchmarking in the 100k node 1M edge range. This also updates the benchmark fixture to download the dataset instead of requiring a separate script. This also includes changes from #4721 to add additional k-values for BC benchmarks to run. The addition was done with an inline yaml file and a followup PR will be done to properly add this dataset to the cugraph.datasets API. The inline yaml was done in order to allow benchmarks to run on a system with an existing cugraph installation by simply copying this bench_algos.py, rather than require an updated cugraph install with the new dataset metadata. Authors: - Rick Ratzel (https://github.com/rlratzel) - Ralph Liu (https://github.com/nv-rliu) Approvers: - Ralph Liu (https://github.com/nv-rliu) - Don Acosta (https://github.com/acostadon) URL: #4726
rapidsai · Oct 17, 2024 · 0c81f15 · 0c81f15
1 parent 0d0d28a
commit 0c81f15
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 58 deletions.
diff --git a/benchmarks/nx-cugraph/pytest-based/README.md b/benchmarks/nx-cugraph/pytest-based/README.md
@@ -21,7 +21,9 @@ Our current benchmarks provide the following datasets:
 #### 1. `run-main-benchmarks.sh`
 This script allows users to run a small set of commonly-used algorithms across multiple datasets and backends. All results are stored inside a sub-directory (`logs/`) and output files are named based on the combination of parameters for that benchmark.
 
-NOTE: If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
+NOTE:
+ - If running with all algorithms and datasets using NetworkX without an accelerated backend, this script may take a few hours to finish running.
+ - The `betweenness_centrality` benchmark will run with k-values `[10, 20, 50, 100, 500, 1000]` by default. To run only specific k-values, edit `bc_k_values` (line 46); its value is appended to the [pytest keyword expression](https://docs.pytest.org/en/6.2.x/usage.html#specifying-tests-selecting-tests) used to select the tests.
 
 **Usage:**
  - Run with `--cpu-only`:

diff --git a/benchmarks/nx-cugraph/pytest-based/bench_algos.py b/benchmarks/nx-cugraph/pytest-based/bench_algos.py
@@ -37,6 +37,40 @@
 iterations = 1
 warmup_rounds = 1
 
+# FIXME: Add this to cugraph.datasets.  This is done here so these benchmarks
+# can be run without requiring an updated cugraph install.  This temporarily
+# adds a dataset based on an Amazon product co-purchasing network.
+amazon0302_metadata = """
+name: amazon0302
+description:
+  Network was collected by crawling Amazon website. It is based on Customers Who Bought This Item Also Bought feature of the Amazon website. If a product i is frequently co-purchased with product j, the graph contains a directed edge from i to j. The data was collected in March 02 2003.
+author: J. Leskovec, L. Adamic and B. Adamic
+refs: J. Leskovec, L. Adamic and B. Adamic. The Dynamics of Viral Marketing. ACM Transactions on the Web (ACM TWEB), 1(1), 2007.
+delim: "\t"
+header: 3
+col_names:
+  - FromNodeId
+  - ToNodeId
+col_types:
+  - int32
+  - int32
+has_loop: false
+is_directed: true
+is_multigraph: false
+is_symmetric: false
+number_of_edges: 1234877
+number_of_nodes: 262111
+url: https://snap.stanford.edu/data/amazon0302.txt.gz
+"""
+amazon0302_metadata_file_name = datasets.default_download_dir.path / "amazon0302.yaml"
+if not amazon0302_metadata_file_name.exists():
+    amazon0302_metadata_file_name.parent.mkdir(parents=True, exist_ok=True)
+    with open(amazon0302_metadata_file_name, "w") as f:
+        f.write(amazon0302_metadata)
+
+amazon0302_dataset = datasets.Dataset(amazon0302_metadata_file_name)
+amazon0302_dataset.metadata["file_type"] = ".gz"
+
 dataset_param_values = [
     # name: karate, nodes: 34, edges: 156
     pytest.param(datasets.karate, marks=[pytest.mark.small, pytest.mark.undirected]),
@@ -46,6 +80,8 @@
     pytest.param(
         datasets.email_Eu_core, marks=[pytest.mark.small, pytest.mark.directed]
     ),
+    # name: amazon0302, nodes: 262111, edges: 1234877
+    pytest.param(amazon0302_dataset, marks=[pytest.mark.medium, pytest.mark.directed]),
     # name: cit-Patents, nodes: 3774768, edges: 16518948
     pytest.param(
         datasets.cit_patents, marks=[pytest.mark.medium, pytest.mark.directed]
@@ -113,19 +149,7 @@ def nx_graph_from_dataset(dataset_obj):
     """
     create_using = nx.DiGraph if dataset_obj.metadata["is_directed"] else nx.Graph
     names = dataset_obj.metadata["col_names"]
-    dtypes = dataset_obj.metadata["col_types"]
-    if isinstance(dataset_obj.metadata["header"], int):
-        header = dataset_obj.metadata["header"]
-    else:
-        header = None
-
-    pandas_edgelist = pd.read_csv(
-        dataset_obj.get_path(),
-        delimiter=dataset_obj.metadata["delim"],
-        names=names,
-        dtype=dict(zip(names, dtypes)),
-        header=header,
-    )
+    pandas_edgelist = dataset_obj.get_edgelist(download=True, reader="pandas")
     G = nx.from_pandas_edgelist(
         pandas_edgelist, source=names[0], target=names[1], create_using=create_using
     )
@@ -272,7 +296,7 @@ def bench_from_networkx(benchmark, graph_obj):
 
 # normalized_param_values = [True, False]
 normalized_param_values = [True]
-k_param_values = [10, 100, 1000]
+k_param_values = [10, 20, 50, 100, 500, 1000]
 
 
 @pytest.mark.parametrize(
@@ -281,7 +305,6 @@ def bench_from_networkx(benchmark, graph_obj):
 @pytest.mark.parametrize("k", k_param_values, ids=lambda k: f"{k=}")
 def bench_betweenness_centrality(benchmark, graph_obj, backend_wrapper, normalized, k):
     G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)
-
     if k > G.number_of_nodes():
         pytest.skip(reason=f"{k=} > {G.number_of_nodes()=}")
 

diff --git a/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py b/benchmarks/nx-cugraph/pytest-based/create_results_summary_page.py
@@ -166,6 +166,7 @@ def get_system_info():
     ordered_datasets = [
         "netscience",
         "email_Eu_core",
+        "amazon0302",
         "cit-patents",
         "hollywood",
         "soc-livejournal1",
@@ -174,6 +175,7 @@ def get_system_info():
     dataset_meta = {
         "netscience": ["1,461", "5,484", "Yes"],
         "email_Eu_core": ["1,005", "25,571", "Yes"],
+        "amazon0302": ["262,111", "1,234,877", "Yes"],
         "cit-patents": ["3,774,768", "16,518,948", "Yes"],
         "hollywood": ["1,139,905", "57,515,616", "No"],
         "soc-livejournal1": ["4,847,571", "68,993,773", "Yes"],

diff --git a/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py b/benchmarks/nx-cugraph/pytest-based/get_graph_bench_dataset.py
diff --git a/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh b/benchmarks/nx-cugraph/pytest-based/run-main-benchmarks.sh
@@ -14,7 +14,7 @@
 
 
 # location to store datasets used for benchmarking
-export RAPIDS_DATASET_ROOT_DIR=/datasets/cugraph
+export RAPIDS_DATASET_ROOT_DIR=${RAPIDS_DATASET_ROOT_DIR:-/datasets/cugraph}
 mkdir -p logs
 
 # list of algos, datasets, and back-ends to use in combinations
@@ -30,6 +30,7 @@ algos="
 datasets="
    netscience
    email_Eu_core
+   amazon0302
    cit-patents
    hollywood
    soc-livejournal
@@ -40,6 +41,11 @@ backends="
     None
     cugraph-preconverted
 "
+
+# edit this directly to restrict which k-values pytest runs; the value is
+# appended to the pytest -k expression, e.g. bc_k_values="and not 100 and not 1000"
+bc_k_values=""
+
 # check for --cpu-only or --gpu-only args
 if [[ "$#" -eq 1 ]]; then
     case $1 in
@@ -58,15 +64,15 @@ fi
 
 for algo in $algos; do
     for dataset in $datasets; do
-	# this script can be used to download benchmarking datasets by name via cugraph.datasets
-    	python get_graph_bench_dataset.py $dataset
         for backend in $backends; do
             name="${backend}__${algo}__${dataset}"
             echo "Running: $backend, $dataset, bench_$algo"
-            # command to preproduce test
-            # echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo and not 1000\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"
-            pytest -sv \
-                -k "$backend and $dataset and bench_$algo and not 1000" \
+
+            # uncomment to get command for reproducing test
+            # echo "RUNNING: \"pytest -sv -k \"$backend and $dataset and bench_$algo $bc_k_values\" --benchmark-json=\"logs/${name}.json\" bench_algos.py"
+
+            pytest -sv --co \
+                -k "$backend and $dataset and bench_$algo $bc_k_values" \
                 --benchmark-json="logs/${name}.json" \
                 bench_algos.py 2>&1 | tee "logs/${name}.out"
         done