From e201ce628e781769b54121ac83a10cfee379f6b8 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 22 Nov 2024 18:37:58 -0800 Subject: [PATCH 01/10] Upgrade dependency versions in pipeline Upgrade dependency versions in Azure pipeline: * Remove Python 3.6 and add Python 3.10 for cpu-unit-test * Upgrade CUDA from 11.1 to 12.4 for cuda-unit-test * Update labels accordingly --- .azure-pipelines/ansible-integration-test.yml | 1 + .azure-pipelines/cpu-unit-test.yml | 4 ++-- .azure-pipelines/cuda-unit-test.yml | 3 ++- .codecov.yml | 6 ++++-- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.azure-pipelines/ansible-integration-test.yml b/.azure-pipelines/ansible-integration-test.yml index f5b34dd60..9c8550966 100644 --- a/.azure-pipelines/ansible-integration-test.yml +++ b/.azure-pipelines/ansible-integration-test.yml @@ -7,6 +7,7 @@ trigger: pool: name: SuperBench CI + demands: ansible-agent vmImage: ubuntu-latest container: diff --git a/.azure-pipelines/cpu-unit-test.yml b/.azure-pipelines/cpu-unit-test.yml index 7fc698f4f..1de67824f 100644 --- a/.azure-pipelines/cpu-unit-test.yml +++ b/.azure-pipelines/cpu-unit-test.yml @@ -7,12 +7,12 @@ trigger: strategy: matrix: - python-3.6: - imageTag: '3.6' python-3.7: imageTag: '3.7' python-3.8: imageTag: '3.8' + python-3.10: + imageTag: '3.10' # TODO #python-latest: # imageTag: '3' diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index e0a69fc0d..2dbdfb9a5 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -7,10 +7,11 @@ trigger: pool: name: SuperBench CI + demands: cuda-agent vmImage: ubuntu-latest container: - image: nvcr.io/nvidia/pytorch:20.12-py3 + image: nvcr.io/nvidia/pytorch:24.03-py3 options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/' steps: diff --git a/.codecov.yml b/.codecov.yml index 81d50f8bc..8f9f5de87 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -14,8 +14,9 @@ coverage: target: 80% threshold: 1% flags: - - cpu-python3.6-unit-test - cpu-python3.7-unit-test + - cpu-python3.8-unit-test + - cpu-python3.10-unit-test - cuda-unit-test - directx-unit-test patch: @@ -23,7 +24,8 @@ coverage: target: 80% threshold: 1% flags: - - cpu-python3.6-unit-test - cpu-python3.7-unit-test + - cpu-python3.8-unit-test + - cpu-python3.10-unit-test - cuda-unit-test - directx-unit-test From 1d2c652d30467bf1fd8e36adf7735c4ba5e881f5 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 22 Nov 2024 19:08:06 -0800 Subject: [PATCH 02/10] Fix sudo issue inside container Fix sudo issue inside container. --- .azure-pipelines/cuda-unit-test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index 2dbdfb9a5..1885b7c85 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -12,18 +12,18 @@ pool: container: image: nvcr.io/nvidia/pytorch:24.03-py3 - options: '-v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker -v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/:/usr/lib/sudo/' + options: '--name cuda-ci -v /var/run/docker.sock:/var/run/docker.sock -v /usr/bin/docker:/usr/bin/docker:ro' steps: - script: | echo "##vso[task.prependpath]$HOME/.local/bin" displayName: Export path - script: | + docker exec -t -u root -e DEBIAN_FRONTEND=noninteractive cuda-ci bash -c \ + "apt-get update -y && apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev" python3 -m pip install --upgrade pip setuptools==65.7 python3 -m pip install .[test,nvworker] make postinstall - sudo DEBIAN_FRONTEND=noninteractive apt-get update - sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev displayName: Install dependencies - script: | python3 setup.py lint From e0da4c43c3ea55dfa1056ea7197654f520532fab Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 22 Nov 2024 19:13:09 -0800 Subject: [PATCH 03/10] Update Update. --- .azure-pipelines/cuda-unit-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index 1885b7c85..d53593850 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -20,7 +20,7 @@ steps: displayName: Export path - script: | docker exec -t -u root -e DEBIAN_FRONTEND=noninteractive cuda-ci bash -c \ - "apt-get update -y && apt-get install -y ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev" + "apt-get update -y -q && apt-get install -y -q ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev" python3 -m pip install --upgrade pip setuptools==65.7 python3 -m pip install .[test,nvworker] make postinstall From 2a240b63babe29772364d79861ce3f0639e55fd1 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 22 Nov 2024 19:40:48 -0800 Subject: [PATCH 04/10] Fix build Fix build. --- .azure-pipelines/cuda-unit-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index d53593850..649a21f44 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -20,7 +20,8 @@ steps: displayName: Export path - script: | docker exec -t -u root -e DEBIAN_FRONTEND=noninteractive cuda-ci bash -c \ - "apt-get update -y -q && apt-get install -y -q ffmpeg libavcodec-dev libavformat-dev libavutil-dev libswresample-dev" + "apt-get update -y -q && yes | apt-get install -y -q \ + ffmpeg libavcodec-dev libavformat-dev libavutil-dev libboost-program-options-dev libswresample-dev sudo" python3 -m pip install --upgrade pip setuptools==65.7 python3 -m pip install .[test,nvworker] make postinstall From 682b3ce591007f10433ce799b6a8b8d225aeaeab Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Fri, 22 Nov 2024 21:38:43 -0800 Subject: [PATCH 05/10] Fix build Fix build. --- .azure-pipelines/cuda-unit-test.yml | 6 ++++-- tests/analyzer/test_summaryop.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.azure-pipelines/cuda-unit-test.yml b/.azure-pipelines/cuda-unit-test.yml index 649a21f44..36f03d242 100644 --- a/.azure-pipelines/cuda-unit-test.yml +++ b/.azure-pipelines/cuda-unit-test.yml @@ -20,8 +20,10 @@ steps: displayName: Export path - script: | docker exec -t -u root -e DEBIAN_FRONTEND=noninteractive cuda-ci bash -c \ - "apt-get update -y -q && yes | apt-get install -y -q \ - ffmpeg libavcodec-dev libavformat-dev libavutil-dev libboost-program-options-dev libswresample-dev sudo" + "apt-get update -y -q && \ + yes '' | apt-get install -y -q sudo && \ + apt-get install -y -q \ + ffmpeg libavcodec-dev libavformat-dev libavutil-dev libboost-program-options-dev libswresample-dev" python3 -m pip install --upgrade pip setuptools==65.7 python3 -m pip install .[test,nvworker] make postinstall diff --git a/tests/analyzer/test_summaryop.py b/tests/analyzer/test_summaryop.py index 3b1054444..889ebc1e8 100644 --- a/tests/analyzer/test_summaryop.py +++ b/tests/analyzer/test_summaryop.py @@ -4,7 +4,7 @@ """Tests for SummaryOp module.""" import unittest -from numpy import NaN, float64 +from numpy import nan, float64 import pandas as pd @@ -55,7 +55,7 @@ def test_rule_op(self): # Test - std result = SummaryOp.std(raw_data_df) print(result) - expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, NaN], index=['a', 'b', 'c', 'd'], dtype=float64) + expectedResult = pd.Series([3.0, 3.0, 2.1213203435596424, nan], index=['a', 'b', 'c', 'd'], dtype=float64) pd.testing.assert_series_equal(result, expectedResult) # Test - count result = SummaryOp.count(raw_data_df) From 957983b65e38dbefbf113a4b4a94ebff9ce41834 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 25 Nov 2024 23:07:38 -0800 Subject: [PATCH 06/10] Fix unit test Fix unit test. --- setup.py | 2 +- superbench/benchmarks/base.py | 2 ++ .../benchmarks/micro_benchmarks/_export_torch_to_onnx.py | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 738095889..584aed22e 100644 --- a/setup.py +++ b/setup.py @@ -211,7 +211,7 @@ def run(self): 'torch': [ 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', - 'transformers>=4.3.3, <4.23.0', + 'transformers>=4.28.0', ], 'ort': [ 'onnx>=1.10.2', diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index 86c6b6d15..323f366d8 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -48,6 +48,8 @@ def __init__(self, name, parameters=''): allow_abbrev=False, formatter_class=SortedMetavarTypeHelpFormatter, ) + # Fix optionals title in Python 3.10 + self._parser._optionals.title = 'optional arguments:' self._args = None self._curr_run_index = 0 self._result = None diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index 1e37b793d..abb75676d 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -138,7 +138,7 @@ def export_torchvision_model(self, model_name, batch_size=1): model, dummy_input, file_name, - opset_version=10, + opset_version=14, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, input_names=['input'], output_names=['output'], @@ -179,7 +179,7 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): model, dummy_input, file_name, - opset_version=10, + opset_version=14, do_constant_folding=True, input_names=['input'], output_names=['output'], From 0eb654f5e7cb0640958ff2cc438dd26de5e10c67 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 25 Nov 2024 23:26:28 -0800 Subject: [PATCH 07/10] Update Update. Co-authored-by: Dilip Patlolla --- docs/getting-started/installation.mdx | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/getting-started/installation.mdx b/docs/getting-started/installation.mdx index 30fdee829..0a582e92f 100644 --- a/docs/getting-started/installation.mdx +++ b/docs/getting-started/installation.mdx @@ -26,7 +26,7 @@ Here're the system requirements for control node. ### Requirements * Latest version of Linux, you're highly encouraged to use Ubuntu 18.04 or later. -* [Python](https://www.python.org/) version 3.6 or later (which can be checked by running `python3 --version`). +* [Python](https://www.python.org/) version 3.7 or later (which can be checked by running `python3 --version`). * [Pip](https://pip.pypa.io/en/stable/installing/) version 18.0 or later (which can be checked by running `python3 -m pip --version`). :::note diff --git a/setup.py b/setup.py index 584aed22e..2474dcbc1 100644 --- a/setup.py +++ b/setup.py @@ -131,17 +131,17 @@ def run(self): 'Operating System :: POSIX', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3 :: Only', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', 'Topic :: System :: Benchmark', 'Topic :: System :: Clustering', 'Topic :: System :: Hardware', ], keywords='benchmark, AI systems', packages=find_packages(exclude=['tests']), - python_requires='>=3.6, <4', + python_requires='>=3.7, <4', use_scm_version={ 'local_scheme': 'node-and-date', 'version_scheme': lambda _: superbench.__version__, From 999cc3342c7141c211c551bdf743970ea38ed720 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Mon, 25 Nov 2024 23:29:31 -0800 Subject: [PATCH 08/10] Fix unit test Fix unit test. --- superbench/benchmarks/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superbench/benchmarks/base.py b/superbench/benchmarks/base.py index 323f366d8..014103744 100644 --- a/superbench/benchmarks/base.py +++ b/superbench/benchmarks/base.py @@ -49,7 +49,7 @@ def __init__(self, name, parameters=''): formatter_class=SortedMetavarTypeHelpFormatter, ) # Fix optionals title in Python 3.10 - self._parser._optionals.title = 'optional arguments:' + self._parser._optionals.title = 'optional arguments' self._args = None self._curr_run_index = 0 self._result = None From 9788f534483f0a83493c6a371d4ac039d96c627c Mon Sep 17 00:00:00 2001 From: pdr Date: Tue, 26 Nov 2024 20:37:22 -0800 Subject: [PATCH 09/10] fix cache tests issues with cuda 12.4 and pytorch 2 (#672) Fix cache tests issues with cuda 12.4 and pytorch 2 https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management: ``` PyTorch uses a caching memory allocator to speed up memory allocations. This allows fast memory deallocation without device synchronizations. However, the unused memory managed by the allocator will still show as if used in nvidia-smi. You can use memory_allocated() and max_memory_allocated() to monitor memory occupied by tensors, and use memory_reserved() and max_memory_reserved() to monitor the total amount of memory managed by the caching allocator. Calling empty_cache() releases all unused cached memory from PyTorch so that those can be used by other GPU applications. However, the occupied GPU memory by tensors will not be freed so it can not increase the amount of GPU memory available for PyTorch. ``` --- .../model_benchmarks/test_pytorch_base.py | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tests/benchmarks/model_benchmarks/test_pytorch_base.py b/tests/benchmarks/model_benchmarks/test_pytorch_base.py index d92cd187b..96e1718a0 100644 --- a/tests/benchmarks/model_benchmarks/test_pytorch_base.py +++ b/tests/benchmarks/model_benchmarks/test_pytorch_base.py @@ -250,16 +250,35 @@ def test_pytorch_empty_cache(): # Register mnist benchmark. BenchmarkRegistry.register_benchmark('pytorch-mnist', PytorchMNIST) + # Get initial memory reserved + init_res_memory = torch.cuda.memory_reserved() + # Test cache empty by manually calling torch.cuda.empty_cache(). parameters = '--batch_size 32 --num_warmup 8 --num_steps 64 --model_action train' benchmark = PytorchMNIST('pytorch-mnist', parameters=parameters) + assert (benchmark) assert (benchmark._preprocess()) assert (benchmark._benchmark()) del benchmark - assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] > 0) + + # Get current reserved memory after benchmark + post_bm_res_memory = torch.cuda.memory_reserved() + + # Assert that memory is increased after benchmark + assert (post_bm_res_memory >= init_res_memory) + + # Manually empty cache and get reserved memory + # Calling empty_cache() releases all unused cached memory from PyTorch so that those can be used by + # other GPU applications. However, the occupied GPU memory by tensors will not be freed so it can not + # increase the amount of GPU memory available for PyTorch. + # https://pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management torch.cuda.empty_cache() - assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0) + post_empty_cache_res_memory = torch.cuda.memory_reserved() + + # Assert that some memory is released after manually empty cache. The cache is not guaranteed to be reset + # back to the init_res_memory due to some tensors not being released. + assert (post_empty_cache_res_memory <= post_bm_res_memory) # Test automatic cache empty. context = BenchmarkRegistry.create_benchmark_context( @@ -268,4 +287,4 @@ def test_pytorch_empty_cache(): benchmark = BenchmarkRegistry.launch_benchmark(context) assert (benchmark) - assert (torch.cuda.memory_stats()['reserved_bytes.all.current'] == 0) + assert (torch.cuda.memory_reserved() == post_empty_cache_res_memory) From 7d1f04e89eb98b811eeefb0637055ae209e967e8 Mon Sep 17 00:00:00 2001 From: Yifan Xiong Date: Tue, 26 Nov 2024 20:39:41 -0800 Subject: [PATCH 10/10] Revert changes for transformer Revert changes for transformer. --- setup.py | 2 +- .../benchmarks/micro_benchmarks/_export_torch_to_onnx.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 2474dcbc1..cf9779a08 100644 --- a/setup.py +++ b/setup.py @@ -211,7 +211,7 @@ def run(self): 'torch': [ 'torch>=1.7.0a0', 'torchvision>=0.8.0a0', - 'transformers>=4.28.0', + 'transformers>=4.3.3, <4.23.0', ], 'ort': [ 'onnx>=1.10.2', diff --git a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py index abb75676d..1e37b793d 100644 --- a/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py +++ b/superbench/benchmarks/micro_benchmarks/_export_torch_to_onnx.py @@ -138,7 +138,7 @@ def export_torchvision_model(self, model_name, batch_size=1): model, dummy_input, file_name, - opset_version=14, + opset_version=10, operator_export_type=torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK, input_names=['input'], output_names=['output'], @@ -179,7 +179,7 @@ def export_benchmark_model(self, model_name, batch_size=1, seq_length=512): model, dummy_input, file_name, - opset_version=14, + opset_version=10, do_constant_folding=True, input_names=['input'], output_names=['output'],