From de427486599ae1ad93eeabe280adb96f2c5587ec Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Sat, 21 Sep 2024 15:13:58 +0530 Subject: [PATCH 01/67] added checksums --- script/get-ml-model-dlrm-terabyte/_cm.json | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/script/get-ml-model-dlrm-terabyte/_cm.json b/script/get-ml-model-dlrm-terabyte/_cm.json index 553808932..622488825 100644 --- a/script/get-ml-model-dlrm-terabyte/_cm.json +++ b/script/get-ml-model-dlrm-terabyte/_cm.json @@ -68,7 +68,8 @@ "CM_PACKAGE_URL": "https://dlrm.s3-us-west-1.amazonaws.com/models/tb00_40M.onnx.tar", "CM_UNTAR": "yes", "CM_ML_MODEL_FILE": "tb00_40M.onnx", - "CM_ML_MODEL_DLRM_MAX_INDEX_RANGE": "40000000" + "CM_ML_MODEL_DLRM_MAX_INDEX_RANGE": "40000000", + "CM_DOWNLOAD_CHECKSUM": "763b964eaffe5f86e92cdcb60c5dc0de" } }, "pytorch": { @@ -110,7 +111,8 @@ "CM_PACKAGE_URL": "https://cloud.mlcommons.org/index.php/s/XzfSeLgW8FYfR3S/download", "CM_DAE_EXTRACT_DOWNLOADED": "yes", "CM_DOWNLOAD_FILENAME": "download", - "CM_EXTRACT_UNZIP": "yes" + "CM_EXTRACT_UNZIP": "yes", + "CM_DOWNLOAD_CHECKSUM": "07e76718b52601303bb5c54fc0a3500c" } }, "wget": { @@ -138,7 +140,8 @@ "env": { "CM_ML_MODEL_ACCURACY": "0.8107", "CM_PACKAGE_URL": "https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.pt", - "CM_ML_MODEL_DLRM_MAX_INDEX_RANGE": "10000000" + "CM_ML_MODEL_DLRM_MAX_INDEX_RANGE": "10000000", + "CM_DOWNLOAD_CHECKSUM": "b7cacffcf75f767faa9cb2af397723aa" } }, "onnx,fp32,debug": { @@ -147,7 +150,8 @@ "CM_PACKAGE_URL": "https://dlrm.s3-us-west-1.amazonaws.com/models/tb0875_10M.onnx.tar", "CM_ML_MODEL_DLRM_MAX_INDEX_RANGE": "10000000", "CM_UNTAR": "yes", - "CM_ML_MODEL_FILE": "tb0875_10M.onnx" + "CM_ML_MODEL_FILE": "tb0875_10M.onnx", + "CM_DOWNLOAD_CHECKSUM": "d11255cd9926cda9181a347861e4d263" } }, "weight_sharded": { From 7249b7350e0857c2013abcaf703022753045d50f Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Sat, 21 Sep 2024 18:23:10 +0530 Subject: [PATCH 02/67] corrected pre download clean --- script/download-file/customize.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 0a281bce0..7f6b56eff 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -13,7 +13,7 @@ def preprocess(i): quiet = (env.get('CM_QUIET', False) == 'yes') tool = env.get('CM_DOWNLOAD_TOOL', '') - pre_clean = env.get('CM_PRE_DOWNLOAD_CLEAN', False) + pre_clean = env.get('CM_PRE_DOWNLOAD_CLEAN', True) # xsep = '^&^&' if windows else '&&' xsep = '&&' @@ -208,7 +208,8 @@ def preprocess(i): for x in ['CM_DOWNLOAD_CMD', 'CM_DOWNLOAD_CHECKSUM_CMD']: env[x+'_USED']='YES' if env.get(x,'')!='' else 'NO' else: - env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) + if pre_clean: + env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) return {'return':0} From b2b7dfc25beebdf62a433f4fd001d59d3b2d4425 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Sun, 22 Sep 2024 00:17:52 +0530 Subject: [PATCH 03/67] Disabled check for condition --- script/download-file/run.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/script/download-file/run.sh b/script/download-file/run.sh index b738a7cd1..552b27fad 100644 --- a/script/download-file/run.sh +++ b/script/download-file/run.sh @@ -35,10 +35,8 @@ fi if [[ ${require_download} == "1" ]]; then echo "" - if [ -e "${CM_PRE_DOWNLOAD_CLEAN}" ]; then - echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} - ${CM_PRE_DOWNLOAD_CLEAN_CMD} - fi + echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} + ${CM_PRE_DOWNLOAD_CLEAN_CMD} echo "" echo "${CM_DOWNLOAD_CMD}" From 67dc9489155767a8ca8a969f0536c08398187700 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Mon, 23 Sep 2024 11:22:15 +0530 Subject: [PATCH 04/67] Proper exit for unhandled md5sum errors --- script/download-file/customize.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 7f6b56eff..1a639aafa 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -108,6 +108,8 @@ def preprocess(i): elif "no such file" in checksum_result.stderr.lower(): #print(f"No file {env['CM_DOWNLOAD_FILENAME']}. Downloading through cmutil.") cmutil_require_download = 1 + elif checksum_result.returncode == 1: + return {"return":1, "error":f"Error while checking checksum: {checksum_result.stderr}"} else: print(f"File {env['CM_DOWNLOAD_FILENAME']} already present, original checksum and computed checksum matches! Skipping Download..") else: From 729a65ede8bcaf3eca6970caafc3b3f09a614205 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Mon, 23 Sep 2024 18:15:48 +0530 Subject: [PATCH 05/67] sdxl scc commit - WIP --- script/run-mlperf-inference-app/_cm.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index 25bbde364..984752102 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -242,6 +242,16 @@ variations: CM_RUN_SUBMISSION_CHECKER: 'no' group: submission-generation + scc24-base: + adr: + coco2014-preprocessed: + tags: _size.50 + + scc24-main: + adr: + coco2014-preprocessed: + tags: _size.500 + r2.1: env: CM_MLPERF_INFERENCE_VERSION: '2.1' From 4986d1fc2f1ae0044a225260851fc9b5ce621688 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 07:41:22 -0700 Subject: [PATCH 06/67] Restrict the self-hosted runs to the runner repo --- .github/workflows/test-mlperf-inference-gptj.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index 040d00f9b..6728c8851 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -4,15 +4,12 @@ name: MLPerf inference GPT-J on: - push: - branches: [ "main", "dev", "mlperf-inference" ] - paths: - - '.github/workflows/test-mlperf-inference-gptj.yml' - - '**' - - '!**.md' + schedule: + - cron: "1 1 * * */3" jobs: build: + if: github.repository_owner == 'gateoverflow' runs-on: [ self-hosted, linux, x64 ] strategy: fail-fast: false From 30d90d20e6443f74afe172222d2ef3c5b82ec5b8 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 11:45:04 -0700 Subject: [PATCH 07/67] Fix rocm pytorch install --- script/get-generic-python-lib/_cm.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/script/get-generic-python-lib/_cm.json b/script/get-generic-python-lib/_cm.json index 487f79ccb..6143caca2 100644 --- a/script/get-generic-python-lib/_cm.json +++ b/script/get-generic-python-lib/_cm.json @@ -887,6 +887,7 @@ "env": { "CM_GENERIC_PYTHON_PACKAGE_NAME": "torch", "CM_GENERIC_PYTHON_PIP_INDEX_URL": "https://download.pytorch.org/whl/nightly/rocm6.2", + "CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "", "CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torch" }, "new_env_keys": [ @@ -1009,8 +1010,9 @@ "torchvision,rocm": { "env": { "CM_GENERIC_PYTHON_PACKAGE_NAME": "torchvision", - "CM_GENERIC_PYTHON_PIP_INDEX_URL": "https://download.pytorch.org/whl/rocm5.6", - "CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torchvision" + "CM_GENERIC_PYTHON_PIP_INDEX_URL": "https://download.pytorch.org/whl/nightly/rocm6.2", + "CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torchvision", + "CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "" }, "new_env_keys": [ "CM_TORCHVISION_VERSION*" From 6373f6270c2cfeae5ef1f098b861c631f3dd491a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 13:06:40 -0700 Subject: [PATCH 08/67] Fixes for SCC24 --- script/app-mlperf-inference-nvidia/_cm.yaml | 4 ++++ .../app-mlperf-inference-nvidia/customize.py | 7 ++++--- .../_cm.yaml | 2 ++ .../customize.py | 17 ++++++++++++----- script/run-mlperf-inference-app/_cm.yaml | 18 ++++++++++++++++++ 5 files changed, 40 insertions(+), 8 deletions(-) diff --git a/script/app-mlperf-inference-nvidia/_cm.yaml b/script/app-mlperf-inference-nvidia/_cm.yaml index d9750841d..9b586f410 100644 --- a/script/app-mlperf-inference-nvidia/_cm.yaml +++ b/script/app-mlperf-inference-nvidia/_cm.yaml @@ -878,6 +878,8 @@ variations: tags: build,nvidia,inference,server - tags: reproduce,mlperf,inference,nvidia,harness,_preprocess_data + names: + - nvidia-preprocess-data inherit_variation_tags: true force_cache: true skip_inherit_variation_groups: @@ -988,6 +990,8 @@ variations: - tags: reproduce,mlperf,inference,nvidia,harness,_preprocess_data inherit_variation_tags: true + names: + - nvidia-preprocess-data skip_inherit_variation_groups: - run-mode - loadgen-scenario diff --git a/script/app-mlperf-inference-nvidia/customize.py b/script/app-mlperf-inference-nvidia/customize.py index 043c070c8..917102701 100644 --- a/script/app-mlperf-inference-nvidia/customize.py +++ b/script/app-mlperf-inference-nvidia/customize.py @@ -73,10 +73,11 @@ def preprocess(i): elif "stable-diffusion" in env["CM_MODEL"]: target_data_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'data', 'coco', 'SDXL') if not os.path.exists(target_data_path): - cmds.append("make download_data BENCHMARKS='stable-diffusion-xl'") + os.makedirs(target_data_path) + #cmds.append("make download_data BENCHMARKS='stable-diffusion-xl'") env['CM_REQUIRE_COCO2014_DOWNLOAD'] = 'yes' - cmds.append(f"cp -r \${CM_DATASET_PATH_ROOT}/captions/captions.tsv {target_data_path}/captions_5k_final.tsv" ) - cmds.append(f"cp -r \${CM_DATASET_PATH_ROOT}/latents/latents.pt {target_data_path}/latents.pt" ) + cmds.append(f"cp -r \$CM_DATASET_PATH_ROOT/captions/captions.tsv {target_data_path}/captions_5k_final.tsv" ) + cmds.append(f"cp -r \$CM_DATASET_PATH_ROOT/latents/latents.pt {target_data_path}/latents.pt" ) fp16_model_path = os.path.join(env['MLPERF_SCRATCH_PATH'], 'models', 'SDXL', 'official_pytorch', 'fp16', 'stable_diffusion_fp16') if not os.path.exists(os.path.dirname(fp16_model_path)): diff --git a/script/clean-nvidia-mlperf-inference-scratch-space/_cm.yaml b/script/clean-nvidia-mlperf-inference-scratch-space/_cm.yaml index 8d74e1e72..079fe309d 100644 --- a/script/clean-nvidia-mlperf-inference-scratch-space/_cm.yaml +++ b/script/clean-nvidia-mlperf-inference-scratch-space/_cm.yaml @@ -10,6 +10,8 @@ tags: - mlperf - inference uid: bb41f6e3608e4e8a +input_mapping: + extra_cache_rm_tags: CM_CLEAN_EXTRA_CACHE_RM_TAGS deps: # Get Nvidia scratch space where data and models get downloaded - tags: get,mlperf,inference,nvidia,scratch,space diff --git a/script/clean-nvidia-mlperf-inference-scratch-space/customize.py b/script/clean-nvidia-mlperf-inference-scratch-space/customize.py index 8980de245..5a0a95e76 100644 --- a/script/clean-nvidia-mlperf-inference-scratch-space/customize.py +++ b/script/clean-nvidia-mlperf-inference-scratch-space/customize.py @@ -16,22 +16,29 @@ def preprocess(i): clean_cmd = '' cache_rm_tags = '' + extra_cache_rm_tags = env.get('CM_CLEAN_EXTRA_CACHE_RM_TAGS', '') if env.get('CM_MODEL', '') == 'sdxl': if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'downloaded_data': clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "data", "coco", "SDXL")} """ - cache_rm_tags = "nvidia-harness,_preprocessed_data,_sdxl" + cache_rm_tags = "nvidia-harness,_preprocess_data,_sdxl" if env.get('CM_CLEAN_ARTIFACT_NAME', '') == 'preprocessed_data': clean_cmd = f"""rm -rf {os.path.join(env['CM_NVIDIA_MLPERF_SCRATCH_PATH'], "preprocessed_data", "coco2014-tokenized-sdxl")} """ - cache_rm_tags = "nvidia-harness,_preprocessed_data,_sdxl" + cache_rm_tags = "nvidia-harness,_preprocess_data,_sdxl" - if clean_cmd != '': - env['CM_RUN_CMD'] = clean_cmd + cache_rm_tags = cache_rm_tags + extra_cache_rm_tags if cache_rm_tags: - r = cm.access({'action': 'rm', 'automation': 'cache', 'tags': cache_rm_tags}) + r = cm.access({'action': 'rm', 'automation': 'cache', 'tags': cache_rm_tags, 'f': True}) + print(r) if r['return'] != 0 and r['return'] != 16: ## ignore missing ones return r + if r['return'] == 0: # cache entry found + if clean_cmd != '': + env['CM_RUN_CMD'] = clean_cmd + else: + if clean_cmd != '': + env['CM_RUN_CMD'] = clean_cmd return {'return':0} diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index 984752102..0d5ce3ea8 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -243,14 +243,32 @@ variations: group: submission-generation scc24-base: + base: + - short adr: coco2014-preprocessed: tags: _size.50 + coco2014-dataset: + tags: _size.50 + nvidia-preprocess-data: + extra_cache_tags: "scc24-base" + deps: + - tags: clean,nvidia,scratch,_sdxl,_downloaded-data + extra_cache_rm_tags: scc24-main scc24-main: + base: + - short adr: coco2014-preprocessed: tags: _size.500 + coco2014-dataset: + tags: _size.500 + nvidia-preprocess-data: + extra_cache_tags: "scc24-main" + deps: + - tags: clean,nvidia,scratch,_sdxl,_downloaded-data + extra_cache_rm_tags: scc24-base r2.1: env: From 8ceb31351d9fbc994fa93dc706f1eee9fab889ce Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 13:32:30 -0700 Subject: [PATCH 09/67] Update torchvision for rocm --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 5ecb69b5d..7063e8ec0 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -216,6 +216,7 @@ deps: - tags: get,generic-python-lib,_torchvision names: - ml-engine-torchvision + - torchvision skip_if_env: CM_MODEL: - dlrm-v2-99 @@ -231,6 +232,7 @@ deps: - tags: get,generic-python-lib,_torchvision_cuda names: - ml-engine-torchvision + - torchvision enable_if_env: CM_MLPERF_BACKEND: - pytorch @@ -695,6 +697,8 @@ variations: add_deps_recursive: pytorch: tags: _rocm + torchvision: + tags: _rocm rocm,sdxl: add_deps: From 0d133c9551d9b2cb5b0d10aec9114dcf92c02dd9 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 13:37:24 -0700 Subject: [PATCH 10/67] Update sut config name for SCC24 --- script/run-mlperf-inference-app/_cm.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index 0d5ce3ea8..91c1e2a67 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -245,6 +245,8 @@ variations: scc24-base: base: - short + env: + CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX4: scc24-base adr: coco2014-preprocessed: tags: _size.50 @@ -266,6 +268,8 @@ variations: tags: _size.500 nvidia-preprocess-data: extra_cache_tags: "scc24-main" + env: + CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX4: scc24-main deps: - tags: clean,nvidia,scratch,_sdxl,_downloaded-data extra_cache_rm_tags: scc24-base From 14a6a668c2c64f5648b83f6b5c8e140c1700c513 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 14:08:29 -0700 Subject: [PATCH 11/67] Fix starting weights for nvidia mlperf inference sdxl --- script/app-mlperf-inference-nvidia/_cm.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/app-mlperf-inference-nvidia/_cm.yaml b/script/app-mlperf-inference-nvidia/_cm.yaml index 9b586f410..15d6e4519 100644 --- a/script/app-mlperf-inference-nvidia/_cm.yaml +++ b/script/app-mlperf-inference-nvidia/_cm.yaml @@ -423,7 +423,7 @@ variations: group: model env: CM_MODEL: stable-diffusion-xl - CM_NOT_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://github.com/mlcommons/cm4mlops/blob/main/script/get-ml-model-stable-diffusion/_cm.json#L174" + CM_ML_MODEL_STARTING_WEIGHTS_FILENAME: "https://github.com/mlcommons/cm4mlops/blob/main/script/get-ml-model-stable-diffusion/_cm.json#L174" CM_ML_MODEL_WEIGHT_TRANSFORMATIONS: "quantization, affine fusion" CM_ML_MODEL_INPUTS_DATA_TYPE: int32 CM_ML_MODEL_WEIGHTS_DATA_TYPE: int8 From a4706214911dba500b3bf9fe88f2e731dde15f1d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Mon, 23 Sep 2024 14:42:34 -0700 Subject: [PATCH 12/67] Fix torchaudio installation for rocm --- script/get-generic-python-lib/_cm.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/script/get-generic-python-lib/_cm.json b/script/get-generic-python-lib/_cm.json index 6143caca2..fa78d0d96 100644 --- a/script/get-generic-python-lib/_cm.json +++ b/script/get-generic-python-lib/_cm.json @@ -971,7 +971,8 @@ "env": { "CM_GENERIC_PYTHON_PACKAGE_NAME": "torchaudio", "CM_GENERIC_PYTHON_PIP_INDEX_URL": "https://download.pytorch.org/whl/nightly/rocm6.2", - "CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torchaudio" + "CM_GENERIC_PYTHON_PIP_UNINSTALL_DEPS": "torchaudio", + "CM_GENERIC_PYTHON_PIP_EXTRA_INDEX_URL": "" }, "new_env_keys": [ "CM_TORCHAUDIO_VERSION*" From d12083efb8ddbf596449f944ea097ff44f10ea7e Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 11:44:16 +0530 Subject: [PATCH 13/67] preclean fixed --- script/download-file/customize.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index b8fc43ec7..97969418c 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -13,7 +13,7 @@ def preprocess(i): quiet = (env.get('CM_QUIET', False) == 'yes') tool = env.get('CM_DOWNLOAD_TOOL', '') - pre_clean = env.get('CM_PRE_DOWNLOAD_CLEAN', True) + pre_clean = env.get('CM_PRE_DOWNLOAD_CLEAN', False) # xsep = '^&^&' if windows else '&&' xsep = '&&' @@ -214,8 +214,7 @@ def preprocess(i): for x in ['CM_DOWNLOAD_CMD', 'CM_DOWNLOAD_CHECKSUM_CMD']: env[x+'_USED']='YES' if env.get(x,'')!='' else 'NO' else: - if pre_clean: - env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) + env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) return {'return':0} From 74030b292e31ccdf4c4aca7f0da8634606ef0fb9 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 14:30:38 +0530 Subject: [PATCH 14/67] deleted checksum for url -> cloud.* --- script/get-ml-model-dlrm-terabyte/_cm.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/script/get-ml-model-dlrm-terabyte/_cm.json b/script/get-ml-model-dlrm-terabyte/_cm.json index 622488825..e9cb28c56 100644 --- a/script/get-ml-model-dlrm-terabyte/_cm.json +++ b/script/get-ml-model-dlrm-terabyte/_cm.json @@ -111,8 +111,7 @@ "CM_PACKAGE_URL": "https://cloud.mlcommons.org/index.php/s/XzfSeLgW8FYfR3S/download", "CM_DAE_EXTRACT_DOWNLOADED": "yes", "CM_DOWNLOAD_FILENAME": "download", - "CM_EXTRACT_UNZIP": "yes", - "CM_DOWNLOAD_CHECKSUM": "07e76718b52601303bb5c54fc0a3500c" + "CM_EXTRACT_UNZIP": "yes" } }, "wget": { From 3566ac15168f12e68ac2b5fa82ebbdee4f6456b7 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 14:49:32 +0530 Subject: [PATCH 15/67] proper handling of pre_clean --- script/download-file/customize.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 97969418c..4454a659f 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -214,7 +214,12 @@ def preprocess(i): for x in ['CM_DOWNLOAD_CMD', 'CM_DOWNLOAD_CHECKSUM_CMD']: env[x+'_USED']='YES' if env.get(x,'')!='' else 'NO' else: - env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) + # pre_clean variable is used in order to clean the partial download files if checksums are not provided + if env.get('CM_DOWNLOAD_CHECKSUM_FILE', '') == '' and env.get('CM_DOWNLOAD_CHECKSUM', '') == '': + if pre_clean: + env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) + else: + env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) return {'return':0} From 27861a5d657f4253571b9977c65a89625decab16 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 16:19:06 +0530 Subject: [PATCH 16/67] reverted pre clean change --- script/download-file/customize.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/script/download-file/customize.py b/script/download-file/customize.py index 4454a659f..97969418c 100644 --- a/script/download-file/customize.py +++ b/script/download-file/customize.py @@ -214,12 +214,7 @@ def preprocess(i): for x in ['CM_DOWNLOAD_CMD', 'CM_DOWNLOAD_CHECKSUM_CMD']: env[x+'_USED']='YES' if env.get(x,'')!='' else 'NO' else: - # pre_clean variable is used in order to clean the partial download files if checksums are not provided - if env.get('CM_DOWNLOAD_CHECKSUM_FILE', '') == '' and env.get('CM_DOWNLOAD_CHECKSUM', '') == '': - if pre_clean: - env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) - else: - env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) + env['CM_PRE_DOWNLOAD_CLEAN_CMD'] = "rm -f {}".format(env['CM_DOWNLOAD_FILENAME']) return {'return':0} From 54ae9118295011f24a4c1780403e50b30c0828d4 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 16:37:14 +0530 Subject: [PATCH 17/67] changes for custom sample id generation - SDXL --- script/app-mlperf-inference-mlcommons-python/customize.py | 2 ++ script/get-dataset-coco2014/_cm.yaml | 1 + script/get-dataset-coco2014/customize.py | 3 +++ script/get-dataset-coco2014/run.sh | 3 +++ 4 files changed, 9 insertions(+) diff --git a/script/app-mlperf-inference-mlcommons-python/customize.py b/script/app-mlperf-inference-mlcommons-python/customize.py index d6b1222b0..9d0a64955 100644 --- a/script/app-mlperf-inference-mlcommons-python/customize.py +++ b/script/app-mlperf-inference-mlcommons-python/customize.py @@ -296,6 +296,8 @@ def get_run_cmd_reference(os_info, env, scenario_extra_options, mode_extra_optio scenario_extra_options + mode_extra_options + \ " --output " + env['CM_MLPERF_OUTPUT_DIR'] + \ " --model-path " + env['CM_ML_MODEL_PATH'] + if env.get('CM_COCO2014_SAMPLE_ID_PATH','') != '': + cmd += " --ids-path " + env['CM_COCO2014_SAMPLE_ID_PATH'] elif "llama2-70b" in env['CM_MODEL']: env['RUN_DIR'] = os.path.join(env['CM_MLPERF_INFERENCE_SOURCE'], "language", "llama2-70b") diff --git a/script/get-dataset-coco2014/_cm.yaml b/script/get-dataset-coco2014/_cm.yaml index 690e0926a..aba9b2ea0 100644 --- a/script/get-dataset-coco2014/_cm.yaml +++ b/script/get-dataset-coco2014/_cm.yaml @@ -45,6 +45,7 @@ new_env_keys: - CM_DATASET_ANNOTATIONS_DIR_PATH - CM_DATASET_ANNOTATIONS_FILE_PATH - CM_CALIBRATION_DATASET_PATH +- CM_COCO2014_SAMPLE_ID_PATH posthook_deps: - enable_if_env: diff --git a/script/get-dataset-coco2014/customize.py b/script/get-dataset-coco2014/customize.py index b6984bca1..a38f336bc 100644 --- a/script/get-dataset-coco2014/customize.py +++ b/script/get-dataset-coco2014/customize.py @@ -16,6 +16,9 @@ def preprocess(i): def postprocess(i): env = i['env'] + if env.get('CM_GENERATE_SAMPLE_ID', '') == "yes": + env['CM_COCO2014_SAMPLE_ID_PATH'] = os.path.join(os.getcwd(), 'install', 'sample_ids.txt') + print(env['CM_COCO2014_SAMPLE_ID_PATH']) if env.get('CM_DATASET_CALIBRATION','') == "no": env['CM_DATASET_PATH_ROOT'] = os.path.join(os.getcwd(), 'install') #env['CM_DATASET_PATH'] = os.path.join(os.getcwd(), 'install', 'validation', 'data') diff --git a/script/get-dataset-coco2014/run.sh b/script/get-dataset-coco2014/run.sh index f37ba603b..0324ea162 100644 --- a/script/get-dataset-coco2014/run.sh +++ b/script/get-dataset-coco2014/run.sh @@ -33,6 +33,9 @@ else eval $cmd test $? -eq 0 || exit 1 fi +if [[ ${CM_GENERATE_COCO2014_SAMPLE_ID} == "yes" ]]; then + cmd="python3 sample_ids.py --tsv-path ${INSTALL_DIR}" +fi cd ${INSTALL_DIR} test $? -eq 0 || exit 1 From dc9bf9948bfb888d66167da48a0162a356de1f03 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 16:37:46 +0530 Subject: [PATCH 18/67] changes for custom sample id generation - SDXL --- script/run-mlperf-inference-app/_cm.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index 984752102..2d6adfd8b 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -246,11 +246,15 @@ variations: adr: coco2014-preprocessed: tags: _size.50 + env: + CM_GENERATE_SAMPLE_ID: 'yes' scc24-main: adr: coco2014-preprocessed: tags: _size.500 + env: + CM_GENERATE_SAMPLE_ID: 'yes' r2.1: env: From 0c98cbd2e0e6c5e59e988904089aac6d62e5482d Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 16:56:08 +0530 Subject: [PATCH 19/67] code clean --- script/download-file/run.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/script/download-file/run.sh b/script/download-file/run.sh index 552b27fad..b9bf01933 100644 --- a/script/download-file/run.sh +++ b/script/download-file/run.sh @@ -26,6 +26,9 @@ elif [ -e "${CM_DOWNLOAD_DOWNLOADED_PATH}" ]; then # checksum not supposed to fail for locally given file if [[ "${CM_DOWNLOAD_LOCAL_FILE_PATH}" != "" ]]; then exit 1 + else + echo "Checksum mismatch. Deleting through command: ${CM_PRE_DOWNLOAD_CLEAN_CMD}" + ${CM_PRE_DOWNLOAD_CLEAN_CMD} fi else require_download="0" @@ -35,8 +38,10 @@ fi if [[ ${require_download} == "1" ]]; then echo "" - echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} - ${CM_PRE_DOWNLOAD_CLEAN_CMD} + if [ -e "${CM_PRE_DOWNLOAD_CLEAN}" ]; then + echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} + ${CM_PRE_DOWNLOAD_CLEAN_CMD} + fi echo "" echo "${CM_DOWNLOAD_CMD}" From d8a33bca6414df42876565a2c7bf0f66c799fd9c Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 11:54:07 +0000 Subject: [PATCH 20/67] fixed bug --- script/get-dataset-coco2014/run.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/script/get-dataset-coco2014/run.sh b/script/get-dataset-coco2014/run.sh index 0324ea162..9668e3abd 100644 --- a/script/get-dataset-coco2014/run.sh +++ b/script/get-dataset-coco2014/run.sh @@ -33,8 +33,11 @@ else eval $cmd test $? -eq 0 || exit 1 fi -if [[ ${CM_GENERATE_COCO2014_SAMPLE_ID} == "yes" ]]; then - cmd="python3 sample_ids.py --tsv-path ${INSTALL_DIR}" +if [[ ${CM_GENERATE_SAMPLE_ID} == "yes" ]]; then + cmd="python3 sample_ids.py --tsv-path ${INSTALL_DIR}/captions/captions.tsv --output-path ${INSTALL_DIR}/sample_ids.txt" + echo $cmd + eval $cmd + test $? -eq 0 || exit 1 fi cd ${INSTALL_DIR} From b40ea46bbe1b24c250eb36af58a2e03fd5018a28 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 17:40:13 +0530 Subject: [PATCH 21/67] fix pre download clean --- script/download-file/run.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/script/download-file/run.sh b/script/download-file/run.sh index b9bf01933..0fe414beb 100644 --- a/script/download-file/run.sh +++ b/script/download-file/run.sh @@ -27,8 +27,7 @@ elif [ -e "${CM_DOWNLOAD_DOWNLOADED_PATH}" ]; then if [[ "${CM_DOWNLOAD_LOCAL_FILE_PATH}" != "" ]]; then exit 1 else - echo "Checksum mismatch. Deleting through command: ${CM_PRE_DOWNLOAD_CLEAN_CMD}" - ${CM_PRE_DOWNLOAD_CLEAN_CMD} + CM_PRE_DOWNLOAD_CLEAN=true fi else require_download="0" @@ -38,7 +37,7 @@ fi if [[ ${require_download} == "1" ]]; then echo "" - if [ -e "${CM_PRE_DOWNLOAD_CLEAN}" ]; then + if [ "${CM_PRE_DOWNLOAD_CLEAN}" != "" ]; then echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} ${CM_PRE_DOWNLOAD_CLEAN_CMD} fi From 7c8984f22e019b40c55ba351dd267fac15db1a6e Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 24 Sep 2024 17:44:37 +0530 Subject: [PATCH 22/67] added gh action workflow for sdxl reference and nvidia --- .../workflows/test-mlperf-inference-sdxl.yaml | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 .github/workflows/test-mlperf-inference-sdxl.yaml diff --git a/.github/workflows/test-mlperf-inference-sdxl.yaml b/.github/workflows/test-mlperf-inference-sdxl.yaml new file mode 100644 index 000000000..b4415a612 --- /dev/null +++ b/.github/workflows/test-mlperf-inference-sdxl.yaml @@ -0,0 +1,47 @@ +name: MLPerf inference SDXL + +on: + schedule: + - cron: "1 1 * * */3" + +jobs: + build_reference: + if: github.repository_owner == 'gateoverflow' + runs-on: [ self-hosted, linux, x64 ] + strategy: + fail-fast: false + matrix: + python-version: [ "3.12" ] + backend: [ "pytorch" ] + precision: [ "float16" ] + steps: + - name: Install dependencies + run: | + source gh_action/bin/deactivate || python3 -m venv gh_action + source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM + cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + - name: Test MLPerf Inference SDXL + run: | + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean + + build_nvidia: + if: github.repository_owner == 'gateoverflow' + runs-on: [ self-hosted, linux, x64 ] + strategy: + fail-fast: false + matrix: + python-version: [ "3.12" ] + backend: [ "tensorrt" ] + precision: [ "float16" ] + implementation: [ "nvidia" ] + steps: + - name: Install dependencies + run: | + source gh_action/bin/deactivate || python3 -m venv gh_action + source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM + cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + - name: Test MLPerf Inference SDXL + run: | + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From 735b581a345d5a04f8cd7195f6fbff3a37c538d8 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 24 Sep 2024 07:30:09 -0700 Subject: [PATCH 23/67] Fixes for coco2014 saample ids --- script/get-dataset-coco2014/_cm.yaml | 3 +++ script/get-dataset-coco2014/run.sh | 8 ++++---- script/run-mlperf-inference-app/_cm.yaml | 16 ++++------------ 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/script/get-dataset-coco2014/_cm.yaml b/script/get-dataset-coco2014/_cm.yaml index aba9b2ea0..39c603642 100644 --- a/script/get-dataset-coco2014/_cm.yaml +++ b/script/get-dataset-coco2014/_cm.yaml @@ -84,6 +84,9 @@ variations: env: CM_DATASET_SIZE: '#' group: size + with-sample-ids: + env: + CM_GENERATE_SAMPLE_ID: 'yes' validation: default: true env: diff --git a/script/get-dataset-coco2014/run.sh b/script/get-dataset-coco2014/run.sh index 9668e3abd..61b9ffe52 100644 --- a/script/get-dataset-coco2014/run.sh +++ b/script/get-dataset-coco2014/run.sh @@ -26,19 +26,19 @@ if [[ ${CM_DATASET_CALIBRATION} == "no" ]]; then cmd="./download-coco-2014.sh -d ${INSTALL_DIR} ${max_images}" echo $cmd eval $cmd - test $? -eq 0 || exit 1 + test $? -eq 0 || exit $? else cmd="./download-coco-2014-calibration.sh -d ${INSTALL_DIR}" echo $cmd eval $cmd - test $? -eq 0 || exit 1 + test $? -eq 0 || exit $? fi if [[ ${CM_GENERATE_SAMPLE_ID} == "yes" ]]; then cmd="python3 sample_ids.py --tsv-path ${INSTALL_DIR}/captions/captions.tsv --output-path ${INSTALL_DIR}/sample_ids.txt" echo $cmd eval $cmd - test $? -eq 0 || exit 1 + test $? -eq 0 || exit $? fi cd ${INSTALL_DIR} -test $? -eq 0 || exit 1 +test $? -eq 0 || exit $? diff --git a/script/run-mlperf-inference-app/_cm.yaml b/script/run-mlperf-inference-app/_cm.yaml index e3dcb5279..efb637150 100644 --- a/script/run-mlperf-inference-app/_cm.yaml +++ b/script/run-mlperf-inference-app/_cm.yaml @@ -249,13 +249,9 @@ variations: CM_MLPERF_SUT_NAME_RUN_CONFIG_SUFFIX4: scc24-base adr: coco2014-preprocessed: - tags: _size.50 - env: - CM_GENERATE_SAMPLE_ID: 'yes' + tags: _size.50,_with-sample-ids coco2014-dataset: - tags: _size.50 - env: - CM_GENERATE_SAMPLE_ID: 'yes' + tags: _size.50,_with-sample-ids nvidia-preprocess-data: extra_cache_tags: "scc24-base" deps: @@ -267,13 +263,9 @@ variations: - short adr: coco2014-preprocessed: - tags: _size.500 - env: - CM_GENERATE_SAMPLE_ID: 'yes' + tags: _size.500,_with-sample-ids coco2014-dataset: - tags: _size.500 - env: - CM_GENERATE_SAMPLE_ID: 'yes' + tags: _size.500,_with-sample-ids nvidia-preprocess-data: extra_cache_tags: "scc24-main" env: From 0784740fbb6824daaaba8d3a1ea04e2341448473 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 24 Sep 2024 20:11:47 +0530 Subject: [PATCH 24/67] removed beam size --- .github/workflows/test-mlperf-inference-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-sdxl.yaml b/.github/workflows/test-mlperf-inference-sdxl.yaml index b4415a612..166781173 100644 --- a/.github/workflows/test-mlperf-inference-sdxl.yaml +++ b/.github/workflows/test-mlperf-inference-sdxl.yaml @@ -23,7 +23,7 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference SDXL run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean build_nvidia: if: github.repository_owner == 'gateoverflow' @@ -44,4 +44,4 @@ jobs: cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} - name: Test MLPerf Inference SDXL run: | - cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --beam_size=1 --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean + cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --implementation=${{ matrix.implementation }} --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean From 29bed25f581fa339900d3324a9da8d2e91538d07 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Tue, 24 Sep 2024 20:36:18 +0530 Subject: [PATCH 25/67] handled false condition in download-file --- script/download-file/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/download-file/run.sh b/script/download-file/run.sh index 0fe414beb..e0b9037c1 100644 --- a/script/download-file/run.sh +++ b/script/download-file/run.sh @@ -37,7 +37,7 @@ fi if [[ ${require_download} == "1" ]]; then echo "" - if [ "${CM_PRE_DOWNLOAD_CLEAN}" != "" ]; then + if [ "${CM_PRE_DOWNLOAD_CLEAN}" != "" ] && [ "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]; then echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} ${CM_PRE_DOWNLOAD_CLEAN_CMD} fi From b89de1df4fc34866ab050b90f4e819cb773c3d30 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Tue, 24 Sep 2024 21:04:25 +0530 Subject: [PATCH 26/67] Cleanup of download-file run.sh --- script/download-file/run.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/script/download-file/run.sh b/script/download-file/run.sh index e0b9037c1..d9848c39e 100644 --- a/script/download-file/run.sh +++ b/script/download-file/run.sh @@ -37,9 +37,9 @@ fi if [[ ${require_download} == "1" ]]; then echo "" - if [ "${CM_PRE_DOWNLOAD_CLEAN}" != "" ] && [ "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]; then - echo ${CM_PRE_DOWNLOAD_CLEAN_CMD} - ${CM_PRE_DOWNLOAD_CLEAN_CMD} + if [ -n "${CM_PRE_DOWNLOAD_CLEAN}" ] && [ "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]; then + echo "${CM_PRE_DOWNLOAD_CLEAN_CMD}" + eval "${CM_PRE_DOWNLOAD_CLEAN_CMD}" fi echo "" From f1ca1eefa3fd7e7ecc61cb93351513931e167f2c Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Tue, 24 Sep 2024 21:39:23 +0530 Subject: [PATCH 27/67] Create github action for scc24 sdxl --- .github/workflows/test-scc24-sdxl.yaml | 57 ++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 .github/workflows/test-scc24-sdxl.yaml diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml new file mode 100644 index 000000000..36e76b86d --- /dev/null +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -0,0 +1,57 @@ +name: MLPerf inference SDXL + +on: + schedule: + - cron: "1 1 * * */3" + +jobs: + build_reference: + if: github.repository_owner == 'gateoverflow' + runs-on: [ self-hosted, linux, x64 ] + strategy: + fail-fast: false + matrix: + python-version: [ "3.12" ] + backend: [ "pytorch" ] + precision: [ "float16" ] + device: [ "cuda", "rocm" ] + steps: + - name: Install dependencies + run: | + source gh_action/bin/deactivate || python3 -m venv gh_action + source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM + cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + - name: Test MLPerf Inference reference SDXL SCC + env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + run: | + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=reference --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean | + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons | + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet + + build_nvidia: + if: github.repository_owner == 'gateoverflow' + runs-on: [ self-hosted, linux, x64 ] + strategy: + fail-fast: false + matrix: + python-version: [ "3.12" ] + backend: [ "tensorrt" ] + precision: [ "float16" ] + implementation: [ "nvidia" ] + steps: + - name: Install dependencies + run: | + source gh_action/bin/deactivate || python3 -m venv gh_action + source gh_action/bin/activate + export CM_REPOS=$HOME/GH_CM + cm pull repo --url=${{ github.event.pull_request.head.repo.html_url }} --checkout=${{ github.event.pull_request.head.ref }} + - name: Test MLPerf Inference NVIDIA SDXL SCC + env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + run: | + cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean | + cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons | + cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet + From 088a8d44c73cd439d870747174356ea3dc044432 Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 25 Sep 2024 00:31:18 +0530 Subject: [PATCH 28/67] added checksum and clean code --- script/get-dataset-mixtral/_cm.json | 22 +++++++++++++++++++--- script/get-dataset-mixtral/run.bat | 7 ------- script/get-dataset-mixtral/run.sh | 8 -------- 3 files changed, 19 insertions(+), 18 deletions(-) delete mode 100644 script/get-dataset-mixtral/run.bat delete mode 100644 script/get-dataset-mixtral/run.sh diff --git a/script/get-dataset-mixtral/_cm.json b/script/get-dataset-mixtral/_cm.json index e0ddd31ec..656665a28 100644 --- a/script/get-dataset-mixtral/_cm.json +++ b/script/get-dataset-mixtral/_cm.json @@ -34,15 +34,31 @@ "openorca-mbxp-gsm8k-combined" ], "uid": "89e7c91444804775", + "prehook_deps": [ + { + "tags": "download-and-extract", + "env": { + "CM_DOWNLOAD_FINAL_ENV_NAME": "CM_DATASET_PREPROCESSED_PATH" + }, + "update_tags_from_env_with_prefix": { + "_url.": [ "CM_PACKAGE_URL" ] + }, + "force_cache": true, + "extra_cache_tags": "mixtral,get-mixtral-dataset" + } + ], "variations": { "mlcommons-storage":{ "default":true, "env":{ - "CM_RCLONE_WINDOWS_URL": "https://inference.mlcommons-storage.org/mixtral_8x7b%%2F2024.06.06_mixtral_15k_v4.pkl", - "CM_RCLONE_LINUX_URL": "https://inference.mlcommons-storage.org/mixtral_8x7b%2F2024.06.06_mixtral_15k_v4.pkl", - "CM_DATASET_FILE_NAME": "2024.06.06_mixtral_15k_v4.pkl" + "CM_PACKAGE_URL": "https://inference.mlcommons-storage.org/mixtral_8x7b%2F2024.06.06_mixtral_15k_v4.pkl", + "CM_DOWNLOAD_FILENAME": "2024.06.06_mixtral_15k_v4.pkl", + "CM_DOWNLOAD_CHECKSUM": "78823c13e0e73e518872105c4b09628b" }, "group": "download-source" } + }, + "print_env_at_the_end" : { + "CM_DATASET_PREPROCESSED_PATH": "Path to the ML model" } } diff --git a/script/get-dataset-mixtral/run.bat b/script/get-dataset-mixtral/run.bat deleted file mode 100644 index bf1e128dd..000000000 --- a/script/get-dataset-mixtral/run.bat +++ /dev/null @@ -1,7 +0,0 @@ -echo. - -rclone copyurl %CM_RCLONE_WINDOWS_URL% . -a -P -IF %ERRORLEVEL% NEQ 0 EXIT 1 - -echo CM_DATASET_PREPROCESSED_PATH=%CD%\%CM_DATASET_FILE_NAME% > tmp-run-env.out -echo %CD%\%CM_DATASET_FILE_NAME% diff --git a/script/get-dataset-mixtral/run.sh b/script/get-dataset-mixtral/run.sh deleted file mode 100644 index ed3b3142f..000000000 --- a/script/get-dataset-mixtral/run.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -echo "" - -rclone copyurl ${CM_RCLONE_LINUX_URL} ./ -a -P -test $? -eq 0 || exit 1 - -echo "CM_DATASET_PREPROCESSED_PATH=$PWD/${CM_DATASET_FILE_NAME}" > tmp-run-env.out From 6fa8c6a79c32646003100312955c7dbef0765c5c Mon Sep 17 00:00:00 2001 From: anandhu-eng Date: Wed, 25 Sep 2024 00:43:19 +0530 Subject: [PATCH 29/67] clean code --- script/get-dataset-mixtral/_cm.json | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/script/get-dataset-mixtral/_cm.json b/script/get-dataset-mixtral/_cm.json index 656665a28..ca354eea4 100644 --- a/script/get-dataset-mixtral/_cm.json +++ b/script/get-dataset-mixtral/_cm.json @@ -7,27 +7,6 @@ "new_env_keys": [ "CM_DATASET_*" ], - "deps":[ - { - "tags": "detect,detect-os" - }, - { - "skip_if_env": { - "CM_HOST_OS_TYPE": [ - "windows" - ] - }, - "tags": "get,rclone" - }, - { - "enable_if_env": { - "CM_HOST_OS_TYPE": [ - "windows" - ] - }, - "tags": "get,rclone" - } - ], "tags": [ "get", "dataset-mixtral", From 39a36849a4167e70ca20abf959dd2bd3a38b0e86 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 10:26:47 +0100 Subject: [PATCH 30/67] Update test-mlperf-inference-sdxl.yaml | Changed conflicting schedule time --- .github/workflows/test-mlperf-inference-sdxl.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-mlperf-inference-sdxl.yaml b/.github/workflows/test-mlperf-inference-sdxl.yaml index 166781173..a5e348336 100644 --- a/.github/workflows/test-mlperf-inference-sdxl.yaml +++ b/.github/workflows/test-mlperf-inference-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL on: schedule: - - cron: "1 1 * * */3" + - cron: "1 2 * * */3" jobs: build_reference: @@ -25,7 +25,7 @@ jobs: run: | cm run script --tags=run-mlperf,inference,_submission,_short --submitter="MLCommons" --docker --model=sdxl --backend=${{ matrix.backend }} --device=cuda --scenario=Offline --test_query_count=1 --precision=${{ matrix.precision }} --target_qps=1 --quiet --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --hw_name=gh_action --docker_dt=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean - build_nvidia: + build_nvidia: if: github.repository_owner == 'gateoverflow' runs-on: [ self-hosted, linux, x64 ] strategy: From 85303853abccd3d486bf313a4c8604421237d94d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:00:54 +0530 Subject: [PATCH 31/67] Cleanups --- .github/workflows/test-scc24-sdxl.yaml | 5 ++--- script/get-dataset-mixtral/_cm.json | 3 --- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 36e76b86d..3ced30e04 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL on: schedule: - - cron: "1 1 * * */3" + - cron: "1 3 * * */3" jobs: build_reference: @@ -14,7 +14,7 @@ jobs: python-version: [ "3.12" ] backend: [ "pytorch" ] precision: [ "float16" ] - device: [ "cuda", "rocm" ] + device: [ "cuda" ] steps: - name: Install dependencies run: | @@ -54,4 +54,3 @@ jobs: cm run script --tags=run-mlperf,inference,_find-performance,_r4.1-dev,_short,_scc24-base --model=sdxl --implementation=nvidia --backend=${{ matrix.backend }} --category=datacenter --scenario=Offline --execution_mode=test --device=${{ matrix.device }} --precision=${{ matrix.precision }} --docker --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --docker_dt=yes --quiet --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --precision=float16 --clean | cm run script --tags=generate,inference,submission --clean --preprocess_submission=yes --run-checker --tar=yes --env.CM_TAR_OUTFILE=submission.tar.gz --division=open --category=datacenter --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes --run_style=test --adr.submission-checker.tags=_short-run --quiet --submitter=MLCommons | cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/cm4mlperf-inference --repo_branch=mlperf-inference-results-scc24 --commit_message="Results from self hosted Github actions - NVIDIARTX4090" --quiet - diff --git a/script/get-dataset-mixtral/_cm.json b/script/get-dataset-mixtral/_cm.json index ca354eea4..4dfbc82e0 100644 --- a/script/get-dataset-mixtral/_cm.json +++ b/script/get-dataset-mixtral/_cm.json @@ -36,8 +36,5 @@ }, "group": "download-source" } - }, - "print_env_at_the_end" : { - "CM_DATASET_PREPROCESSED_PATH": "Path to the ML model" } } From d1957bf9f5680623b4b48246079da967ea72509a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:03:53 +0530 Subject: [PATCH 32/67] Fix precision for gptj test --- .github/workflows/test-mlperf-inference-gptj.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-gptj.yml b/.github/workflows/test-mlperf-inference-gptj.yml index 6728c8851..5a7ecc7e8 100644 --- a/.github/workflows/test-mlperf-inference-gptj.yml +++ b/.github/workflows/test-mlperf-inference-gptj.yml @@ -16,7 +16,7 @@ jobs: matrix: python-version: [ "3.12" ] backend: [ "pytorch" ] - precision: [ "bfloat16" ] + precision: [ "float16" ] steps: - name: Install dependencies From 21c81709d396462293a134c051c3f0b8e6841384 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:11:45 +0530 Subject: [PATCH 33/67] Fix precision for gptj fp16 --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 7063e8ec0..24bc31044 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -354,6 +354,8 @@ deps: - tags: get,ml-model,large-language-model,gptj names: - ml-model + - ml-model-float16 + - ml-model - gptj-model - gpt-j-model enable_if_env: @@ -1251,9 +1253,9 @@ variations: bfloat16: group: precision add_deps_recursive: - ml-model-bfloat16: + ml-model-float16: tags: - _fp32 + _fp16 env: CM_MLPERF_QUANTIZATION: off CM_MLPERF_MODEL_PRECISION: bfloat16 From 70c1f9f0432a0fb2903917a39751b82065dd09ad Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:16:10 +0530 Subject: [PATCH 34/67] Fix precision for gptj fp16 --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 24bc31044..12b8c50df 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -355,7 +355,6 @@ deps: names: - ml-model - ml-model-float16 - - ml-model - gptj-model - gpt-j-model enable_if_env: From 8c7a2c69d6f5bb081f2a199de4143e249a15db74 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:22:50 +0530 Subject: [PATCH 35/67] Fix precision for gptj fp16 --- script/app-mlperf-inference-mlcommons-python/_cm.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 12b8c50df..452a3f1dd 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -354,7 +354,6 @@ deps: - tags: get,ml-model,large-language-model,gptj names: - ml-model - - ml-model-float16 - gptj-model - gpt-j-model enable_if_env: From bc1036737c2b26846331fd45662c9389e9466c1d Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:51:33 +0530 Subject: [PATCH 36/67] Added support for cuda 12.6.1 --- script/install-cuda-prebuilt/_cm.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/script/install-cuda-prebuilt/_cm.json b/script/install-cuda-prebuilt/_cm.json index 9628bd0fc..b77165c0c 100644 --- a/script/install-cuda-prebuilt/_cm.json +++ b/script/install-cuda-prebuilt/_cm.json @@ -131,6 +131,11 @@ "env": { "CM_CUDA_LINUX_FILENAME": "cuda_12.6.0_560.28.03_linux.run" } + }, + "12.6.1": { + "env": { + "CM_CUDA_LINUX_FILENAME": "cuda_12.6.1_560.35.03_linux.run" + } } } } From a877d2129d0233ca01eaab729bca2b3cd5eb1924 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 16:05:37 +0530 Subject: [PATCH 37/67] Support install prefix for cuda install --- script/install-cuda-prebuilt/customize.py | 4 ++++ script/install-cuda-prebuilt/run.sh | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/script/install-cuda-prebuilt/customize.py b/script/install-cuda-prebuilt/customize.py index fb395bb0b..22c07d5df 100644 --- a/script/install-cuda-prebuilt/customize.py +++ b/script/install-cuda-prebuilt/customize.py @@ -18,6 +18,10 @@ def preprocess(i): supported_versions = list(meta['versions'].keys()) return {'return': 1, 'error': "Only CUDA versions {} are supported now".format(', '.join(supported_versions))} + install_prefix = env.get('CM_CUDA_INSTALL_PREFIX', os.getcwd()) + + env['CM_CUDA_INSTALL_PREFIX'] = install_prefix + recursion_spaces = i['recursion_spaces'] nvcc_bin = "nvcc" diff --git a/script/install-cuda-prebuilt/run.sh b/script/install-cuda-prebuilt/run.sh index 88ad70407..de8d76469 100644 --- a/script/install-cuda-prebuilt/run.sh +++ b/script/install-cuda-prebuilt/run.sh @@ -1,8 +1,6 @@ #!/bin/bash -CUR=${PWD} - -INSTALL_DIR=${CUR}/install +INSTALL_DIR=${CM_CUDA_INSTALL_PREFIX}/install cmd="${CM_SUDO} bash ${CM_CUDA_RUN_FILE_PATH} --toolkitpath=${INSTALL_DIR} --defaultroot=${INSTALL_DIR} --toolkit ${CUDA_ADDITIONAL_INSTALL_OPTIONS} --silent --override" echo "${cmd}" From 1171a54b85f6ed3cbf8bd413652df47d7c922dc7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 11:38:51 +0100 Subject: [PATCH 38/67] Create code-review.yml --- .github/workflows/code-review.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/code-review.yml diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml new file mode 100644 index 000000000..5cde46057 --- /dev/null +++ b/.github/workflows/code-review.yml @@ -0,0 +1,21 @@ +on: + pull_request: + types: [opened, synchronize] + +jobs: + code_review_job: + runs-on: ubuntu-latest + name: ChatGPT Code Review + steps: + - name: GenAI Code Review + uses: cirolini/genai-code-review@v2 + with: + openai_api_key: ${{ secrets.openai_api_key }} + github_token: ${{ secrets.GITHUB_TOKEN }} + github_pr_id: ${{ github.event.number }} + openai_model: "gpt-3.5-turbo" # optional + openai_temperature: 0.5 # optional + openai_max_tokens: 2048 # optional + mode: files # files or patch + language: en # optional, default is 'en' + custom_prompt: "" # optional From 253b0526e26164bfd389ac8bb287f0be08fd0555 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 11:43:55 +0100 Subject: [PATCH 39/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 5cde46057..b4ef17b39 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -13,7 +13,7 @@ jobs: openai_api_key: ${{ secrets.openai_api_key }} github_token: ${{ secrets.GITHUB_TOKEN }} github_pr_id: ${{ github.event.number }} - openai_model: "gpt-3.5-turbo" # optional + openai_model: "GPT-4o" # optional openai_temperature: 0.5 # optional openai_max_tokens: 2048 # optional mode: files # files or patch From 545ddebd7070e1b2cdd94771c6a7b52613062ad1 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 11:48:24 +0100 Subject: [PATCH 40/67] Update code-review.yml --- .github/workflows/code-review.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index b4ef17b39..640e33871 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -5,6 +5,7 @@ on: jobs: code_review_job: runs-on: ubuntu-latest + if: github.repository_owner == 'gateoverflow' name: ChatGPT Code Review steps: - name: GenAI Code Review From b0a02ae2eb647af3ea93acf4f64b64183ec26055 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 16:20:29 +0530 Subject: [PATCH 41/67] Support --install_prefix for cuda installation --- script/install-cuda-prebuilt/_cm.json | 1 + 1 file changed, 1 insertion(+) diff --git a/script/install-cuda-prebuilt/_cm.json b/script/install-cuda-prebuilt/_cm.json index b77165c0c..72441ddd0 100644 --- a/script/install-cuda-prebuilt/_cm.json +++ b/script/install-cuda-prebuilt/_cm.json @@ -18,6 +18,7 @@ }, "input_mapping": { "local_run_file_path": "CUDA_RUN_FILE_LOCAL_PATH", + "install_prefix": "CM_CUDA_INSTALL_PREFIX", "skip_sudo": "CUDA_SKIP_SUDO" }, "new_env_keys": [ From d57bd30076788e836202361b3be9e0fa8b52f3ba Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 16:29:02 +0530 Subject: [PATCH 42/67] Support --install_prefix for cuda installation --- script/install-cuda-prebuilt/customize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/install-cuda-prebuilt/customize.py b/script/install-cuda-prebuilt/customize.py index 22c07d5df..f2f0b8f8b 100644 --- a/script/install-cuda-prebuilt/customize.py +++ b/script/install-cuda-prebuilt/customize.py @@ -32,8 +32,8 @@ def preprocess(i): extra_options += " --driver" env['CUDA_ADDITIONAL_INSTALL_OPTIONS'] = extra_options - env['CM_CUDA_INSTALLED_PATH'] = os.path.join(os.getcwd(), 'install') - env['CM_NVCC_BIN_WITH_PATH'] = os.path.join(os.getcwd(), 'install', 'bin', nvcc_bin) + env['CM_CUDA_INSTALLED_PATH'] = os.path.join(install_prefix, 'install') + env['CM_NVCC_BIN_WITH_PATH'] = os.path.join(install_prefix, 'install', 'bin', nvcc_bin) env['CM_GET_DEPENDENT_CACHED_PATH'] = env['CM_NVCC_BIN_WITH_PATH'] # Set CUDA_RUN_FILE_LOCAL_PATH to empty if not set for backwards compatibility in download file From c1f2139d4ef12c283168f104a8a7898ef594eb49 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 16:50:05 +0530 Subject: [PATCH 43/67] Support --extra_install_args for cuda installation --- script/install-cuda-prebuilt/_cm.json | 3 ++- script/install-cuda-prebuilt/customize.py | 7 +++++++ script/install-cuda-prebuilt/run.sh | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/script/install-cuda-prebuilt/_cm.json b/script/install-cuda-prebuilt/_cm.json index 72441ddd0..bf7965812 100644 --- a/script/install-cuda-prebuilt/_cm.json +++ b/script/install-cuda-prebuilt/_cm.json @@ -19,7 +19,8 @@ "input_mapping": { "local_run_file_path": "CUDA_RUN_FILE_LOCAL_PATH", "install_prefix": "CM_CUDA_INSTALL_PREFIX", - "skip_sudo": "CUDA_SKIP_SUDO" + "skip_sudo": "CUDA_SKIP_SUDO", + "override-driver-check": "CM_CUDA_DRIVER_INSTALL_OVERRIDE" }, "new_env_keys": [ "CM_CUDA_*", diff --git a/script/install-cuda-prebuilt/customize.py b/script/install-cuda-prebuilt/customize.py index f2f0b8f8b..ac20aca71 100644 --- a/script/install-cuda-prebuilt/customize.py +++ b/script/install-cuda-prebuilt/customize.py @@ -22,6 +22,11 @@ def preprocess(i): env['CM_CUDA_INSTALL_PREFIX'] = install_prefix + extra_install_args = '' + + if str(env.get('CM_CUDA_DRIVER_INSTALL_OVERRIDE', '')) != '': + extra_install_args += ' --override-driver-check' + recursion_spaces = i['recursion_spaces'] nvcc_bin = "nvcc" @@ -36,6 +41,8 @@ def preprocess(i): env['CM_NVCC_BIN_WITH_PATH'] = os.path.join(install_prefix, 'install', 'bin', nvcc_bin) env['CM_GET_DEPENDENT_CACHED_PATH'] = env['CM_NVCC_BIN_WITH_PATH'] + env['CM_CUDA_EXTRA_INSTALL_ARGS'] = extra_install_args + # Set CUDA_RUN_FILE_LOCAL_PATH to empty if not set for backwards compatibility in download file env['CUDA_RUN_FILE_LOCAL_PATH'] = env.get('CUDA_RUN_FILE_LOCAL_PATH','') diff --git a/script/install-cuda-prebuilt/run.sh b/script/install-cuda-prebuilt/run.sh index de8d76469..c13e96b3b 100644 --- a/script/install-cuda-prebuilt/run.sh +++ b/script/install-cuda-prebuilt/run.sh @@ -2,7 +2,7 @@ INSTALL_DIR=${CM_CUDA_INSTALL_PREFIX}/install -cmd="${CM_SUDO} bash ${CM_CUDA_RUN_FILE_PATH} --toolkitpath=${INSTALL_DIR} --defaultroot=${INSTALL_DIR} --toolkit ${CUDA_ADDITIONAL_INSTALL_OPTIONS} --silent --override" +cmd="${CM_SUDO} bash ${CM_CUDA_RUN_FILE_PATH} --toolkitpath=${INSTALL_DIR} --defaultroot=${INSTALL_DIR} --toolkit ${CUDA_ADDITIONAL_INSTALL_OPTIONS} --silent --override ${CM_CUDA_EXTRA_INSTALL_ARGS}" echo "${cmd}" eval "${cmd}" test $? -eq 0 || exit $? From 5ccf5f729d9f4684abf285396cbaba37e67b885a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 12:28:00 +0100 Subject: [PATCH 44/67] Improve download-file run.sh --- script/download-file/run.sh | 75 ++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/script/download-file/run.sh b/script/download-file/run.sh index d9848c39e..c02e44f00 100644 --- a/script/download-file/run.sh +++ b/script/download-file/run.sh @@ -1,61 +1,58 @@ #!/bin/bash - +# Execute config command if it exists if [[ -n ${CM_DOWNLOAD_CONFIG_CMD} ]]; then - echo "" - echo "${CM_DOWNLOAD_CONFIG_CMD}" - eval "${CM_DOWNLOAD_CONFIG_CMD}" - test $? -eq 0 || exit $? + echo -e "\nExecuting: ${CM_DOWNLOAD_CONFIG_CMD}" + eval "${CM_DOWNLOAD_CONFIG_CMD}" || exit $? fi +# Assume download is required by default require_download=1 -if [[ "${CM_DOWNLOAD_LOCAL_FILE_PATH}" != "" ]]; then +# No download needed if a local file path is specified or the tool is 'cmutil' +if [[ -n "${CM_DOWNLOAD_LOCAL_FILE_PATH}" || ${CM_DOWNLOAD_TOOL} == "cmutil" ]]; then require_download=0 fi -if [[ ${CM_DOWNLOAD_TOOL} == "cmutil" ]]; then - require_download=0 - -elif [ -e "${CM_DOWNLOAD_DOWNLOADED_PATH}" ]; then - if [[ "${CM_DOWNLOAD_CHECKSUM_CMD}" != "" ]]; then - echo "" - echo "${CM_DOWNLOAD_CHECKSUM_CMD}" - eval "${CM_DOWNLOAD_CHECKSUM_CMD}" - if [ $? -ne 0 ]; then - # checksum not supposed to fail for locally given file - if [[ "${CM_DOWNLOAD_LOCAL_FILE_PATH}" != "" ]]; then - exit 1 - else - CM_PRE_DOWNLOAD_CLEAN=true - fi +# If the file exists, check the checksum if necessary +if [[ -e "${CM_DOWNLOAD_DOWNLOADED_PATH}" && -n "${CM_DOWNLOAD_CHECKSUM_CMD}" ]]; then + echo -e "\nChecking checksum: ${CM_DOWNLOAD_CHECKSUM_CMD}" + eval "${CM_DOWNLOAD_CHECKSUM_CMD}" + + if [[ $? -ne 0 ]]; then + # If the checksum fails, handle errors based on whether the file is local + if [[ -n "${CM_DOWNLOAD_LOCAL_FILE_PATH}" ]]; then + echo "Checksum failed for local file. Exiting." + exit 1 else - require_download="0" + echo "Checksum failed. Marking for re-download." + CM_PRE_DOWNLOAD_CLEAN=true fi + else + # If checksum succeeds, no download is required + require_download=0 fi fi -if [[ ${require_download} == "1" ]]; then +# Perform download if required +if [[ ${require_download} == 1 ]]; then echo "" - if [ -n "${CM_PRE_DOWNLOAD_CLEAN}" ] && [ "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]; then - echo "${CM_PRE_DOWNLOAD_CLEAN_CMD}" - eval "${CM_PRE_DOWNLOAD_CLEAN_CMD}" - fi - echo "" - echo "${CM_DOWNLOAD_CMD}" - eval "${CM_DOWNLOAD_CMD}" - test $? -eq 0 || exit $? + # If a pre-download clean command is specified and needed, execute it + if [[ -n "${CM_PRE_DOWNLOAD_CLEAN}" && "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]]; then + echo "Executing pre-download clean: ${CM_PRE_DOWNLOAD_CLEAN_CMD}" + eval "${CM_PRE_DOWNLOAD_CLEAN_CMD}" || exit $? + fi + # Execute the download command + echo "Downloading: ${CM_DOWNLOAD_CMD}" + eval "${CM_DOWNLOAD_CMD}" || exit $? fi -if [[ ${CM_DOWNLOAD_TOOL} == "cmutil" || ${require_download} == "1" ]]; then - if [[ "${CM_DOWNLOAD_CHECKSUM_CMD}" != "" ]]; then - echo "" - echo "${CM_DOWNLOAD_CHECKSUM_CMD}" - eval "${CM_DOWNLOAD_CHECKSUM_CMD}" - test $? -eq 0 || exit $? +# Verify checksum again if necessary +if [[ ${CM_DOWNLOAD_TOOL} == "cmutil" || ${require_download} == 1 ]]; then + if [[ -n "${CM_DOWNLOAD_CHECKSUM_CMD}" ]]; then + echo -e "\nVerifying checksum after download: ${CM_DOWNLOAD_CHECKSUM_CMD}" + eval "${CM_DOWNLOAD_CHECKSUM_CMD}" || exit $? fi fi - -test $? -eq 0 || exit $? From a0775dd09eddc5dba4c9faa8c0549f14b25e946c Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 12:47:57 +0100 Subject: [PATCH 45/67] Update code-review.yml --- .github/workflows/code-review.yml | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 640e33871..c784788b1 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -1,22 +1,28 @@ +name: OpenAI Code Review + on: pull_request: types: [opened, synchronize] jobs: - code_review_job: + code_review: runs-on: ubuntu-latest if: github.repository_owner == 'gateoverflow' - name: ChatGPT Code Review steps: - - name: GenAI Code Review + # Checkout the code + - name: Checkout repository + uses: actions/checkout@v3 + + # Run code review via OpenAI + - name: Run OpenAI Code Review uses: cirolini/genai-code-review@v2 with: - openai_api_key: ${{ secrets.openai_api_key }} github_token: ${{ secrets.GITHUB_TOKEN }} - github_pr_id: ${{ github.event.number }} - openai_model: "GPT-4o" # optional - openai_temperature: 0.5 # optional - openai_max_tokens: 2048 # optional - mode: files # files or patch - language: en # optional, default is 'en' + openai_api_key: ${{ secrets.openai_api_key }} + github_pr_id: ${{ github.event.pull_request.number }} + openai_model: "gpt-4o" + openai_temperature: 0.5 + openai_max_tokens: 2048 + mode: "files" # Options: files, diff + language: "en" custom_prompt: "" # optional From 8484a75b3c0ebbe1f3cd411901a5c8f4451442aa Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 12:59:05 +0100 Subject: [PATCH 46/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index c784788b1..c89e6b783 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -15,7 +15,7 @@ jobs: # Run code review via OpenAI - name: Run OpenAI Code Review - uses: cirolini/genai-code-review@v2 + uses: dlidstrom/genai-code-review@v2 with: github_token: ${{ secrets.GITHUB_TOKEN }} openai_api_key: ${{ secrets.openai_api_key }} From a6bad02a5e294934f61a8a0e8cf30937f108170b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:04:29 +0100 Subject: [PATCH 47/67] Update code-review.yml --- .github/workflows/code-review.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index c89e6b783..672283c77 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -13,9 +13,9 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - # Run code review via OpenAI + # Run code review via OpenAI - name: Run OpenAI Code Review - uses: dlidstrom/genai-code-review@v2 + uses: dlidstrom/genai-code-review@3 with: github_token: ${{ secrets.GITHUB_TOKEN }} openai_api_key: ${{ secrets.openai_api_key }} From 1f9c4bba966b8dafa37d3dc9169c0769f53051c0 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:11:01 +0100 Subject: [PATCH 48/67] Update code-review.yml --- .github/workflows/code-review.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 672283c77..00f7b08e1 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -4,14 +4,14 @@ on: pull_request: types: [opened, synchronize] +permissions: + issues: write + jobs: code_review: runs-on: ubuntu-latest if: github.repository_owner == 'gateoverflow' steps: - # Checkout the code - - name: Checkout repository - uses: actions/checkout@v3 # Run code review via OpenAI - name: Run OpenAI Code Review From 35c9a87d84bfff8414d2e216a6d1a2bc020ff2cd Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:21:26 +0100 Subject: [PATCH 49/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 00f7b08e1..84d19c5e6 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -15,7 +15,7 @@ jobs: # Run code review via OpenAI - name: Run OpenAI Code Review - uses: dlidstrom/genai-code-review@3 + uses: dlidstrom/genai-code-review@3.0.2 with: github_token: ${{ secrets.GITHUB_TOKEN }} openai_api_key: ${{ secrets.openai_api_key }} From 1c0f623fa63b30599a59e46fcdde9eab34efefa7 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:28:20 +0100 Subject: [PATCH 50/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 84d19c5e6..8682b7212 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -15,7 +15,7 @@ jobs: # Run code review via OpenAI - name: Run OpenAI Code Review - uses: dlidstrom/genai-code-review@3.0.2 + uses: dlidstrom/genai-code-review@v3 with: github_token: ${{ secrets.GITHUB_TOKEN }} openai_api_key: ${{ secrets.openai_api_key }} From 4f441260b5571cb32244a9b909c098b9cb91bab8 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:36:58 +0100 Subject: [PATCH 51/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 8682b7212..98692a151 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -15,7 +15,7 @@ jobs: # Run code review via OpenAI - name: Run OpenAI Code Review - uses: dlidstrom/genai-code-review@v3 + uses: dlidstrom/genai-code-review@v3.0.2 with: github_token: ${{ secrets.GITHUB_TOKEN }} openai_api_key: ${{ secrets.openai_api_key }} From 053682d1742a61901124dc111f7e93775fc55520 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:47:53 +0100 Subject: [PATCH 52/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 98692a151..974732041 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -18,7 +18,7 @@ jobs: uses: dlidstrom/genai-code-review@v3.0.2 with: github_token: ${{ secrets.GITHUB_TOKEN }} - openai_api_key: ${{ secrets.openai_api_key }} + openai_api_key: ${{ secrets.OPENAI_API_KEY }} github_pr_id: ${{ github.event.pull_request.number }} openai_model: "gpt-4o" openai_temperature: 0.5 From 43f35a046c4f1d0452f3b5bb48fd22abaae608b0 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 13:58:22 +0100 Subject: [PATCH 53/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 974732041..d24fdc97e 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -18,7 +18,7 @@ jobs: uses: dlidstrom/genai-code-review@v3.0.2 with: github_token: ${{ secrets.GITHUB_TOKEN }} - openai_api_key: ${{ secrets.OPENAI_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} github_pr_id: ${{ github.event.pull_request.number }} openai_model: "gpt-4o" openai_temperature: 0.5 From 1debe15746c06e174eb9df13a886e9e62820d775 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 14:08:34 +0100 Subject: [PATCH 54/67] Update code-review.yml --- .github/workflows/code-review.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index d24fdc97e..c741db9e2 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -15,10 +15,10 @@ jobs: # Run code review via OpenAI - name: Run OpenAI Code Review - uses: dlidstrom/genai-code-review@v3.0.2 + uses: GATEOverflow/genai-code-review@v1 with: github_token: ${{ secrets.GITHUB_TOKEN }} - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + openai_api_key: ${{ secrets.OPENAI_API_KEY }} github_pr_id: ${{ github.event.pull_request.number }} openai_model: "gpt-4o" openai_temperature: 0.5 From 42db1f7d0d3f478de562391ae24dcfdc334fa34b Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 14:21:36 +0100 Subject: [PATCH 55/67] Update code-review.yml --- .github/workflows/code-review.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index c741db9e2..5dc58ed45 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -12,7 +12,6 @@ jobs: runs-on: ubuntu-latest if: github.repository_owner == 'gateoverflow' steps: - # Run code review via OpenAI - name: Run OpenAI Code Review uses: GATEOverflow/genai-code-review@v1 From e5cc9ce06c7d4517f43fea2fd96429f03ddf8aa1 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 14:28:00 +0100 Subject: [PATCH 56/67] Update code-review.yml --- .github/workflows/code-review.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index c741db9e2..ec9fb5df4 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -1,7 +1,7 @@ name: OpenAI Code Review on: - pull_request: + pull_request_target: types: [opened, synchronize] permissions: From 42ed9d375e9ac7e3a1146037252eddd8f7be16e3 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 15:14:30 +0100 Subject: [PATCH 57/67] Update code-review.yml --- .github/workflows/code-review.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index ec9fb5df4..6315c3927 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -6,6 +6,7 @@ on: permissions: issues: write + pull-requests: write jobs: code_review: From 76c9de9ae745cc232ca5ba449b5e424a85dd5b0a Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 18:17:40 +0100 Subject: [PATCH 58/67] Update code-review.yml --- .github/workflows/code-review.yml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 6315c3927..120511ae4 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -11,19 +11,21 @@ permissions: jobs: code_review: runs-on: ubuntu-latest - if: github.repository_owner == 'gateoverflow' + if: github.repository_owner == 'gateoverflow' && github.event.pull_request.changed_files > 0 steps: # Run code review via OpenAI - - name: Run OpenAI Code Review - uses: GATEOverflow/genai-code-review@v1 - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - openai_api_key: ${{ secrets.OPENAI_API_KEY }} - github_pr_id: ${{ github.event.pull_request.number }} - openai_model: "gpt-4o" - openai_temperature: 0.5 - openai_max_tokens: 2048 - mode: "files" # Options: files, diff - language: "en" - custom_prompt: "" # optional + # Step to run the OpenAI Code Review using the GATEOverflow action + - name: Run OpenAI Code Review + uses: GATEOverflow/genai-code-review@v1 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} # GitHub token for authentication + openai_api_key: ${{ secrets.OPENAI_API_KEY }} # OpenAI API key for accessing the GPT model + github_pr_id: ${{ github.event.pull_request.number }} # ID of the pull request to review + openai_model: "gpt-4o" # Model to use for the code review + openai_temperature: 0.5 # Temperature setting for the model's output + openai_max_tokens: 2048 # Maximum number of tokens for the model's response + mode: "files" # Mode of review, can be "files" or "diff" + language: "en" # Language for the review output + custom_prompt: "" # Optional custom prompt for the model + continue-on-error: true # Allow the workflow to continue even if this step fails From 219ff8c7ecf07dcf199074ba3da6b0a4275821f5 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 18:23:24 +0100 Subject: [PATCH 59/67] Update code-review.yml --- .github/workflows/code-review.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/code-review.yml b/.github/workflows/code-review.yml index 9ee6924fb..258b305f3 100644 --- a/.github/workflows/code-review.yml +++ b/.github/workflows/code-review.yml @@ -3,6 +3,10 @@ name: OpenAI Code Review on: pull_request_target: types: [opened, synchronize] + paths: + - 'automation/**' + - 'script/**' + - '!**.md' permissions: issues: write From 0e1489a9b734c0f87f5067df7337cd6d545ae02f Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 01:13:02 +0530 Subject: [PATCH 60/67] Improve get-cuda-devices to handle multiple GPUs, fixes #288 --- script/get-cuda-devices/_cm.yaml | 14 +++++++++ script/get-cuda-devices/customize.py | 39 ++++++++++++++++++----- script/get-cuda-devices/detect.py | 47 ++++++++++++++++++++++++++++ script/get-cuda-devices/detect.sh | 4 +++ 4 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 script/get-cuda-devices/detect.py create mode 100644 script/get-cuda-devices/detect.sh diff --git a/script/get-cuda-devices/_cm.yaml b/script/get-cuda-devices/_cm.yaml index b91e791af..0c3fcfb23 100644 --- a/script/get-cuda-devices/_cm.yaml +++ b/script/get-cuda-devices/_cm.yaml @@ -36,6 +36,20 @@ new_env_keys: new_state_keys: - cm_cuda_device_prop +- cm_cuda_devices_prop print_files_if_script_error: - tmp-run.out + +variations: + with-pycuda: + env: + CM_CUDA_DEVICES_DETECT_USING_PYCUDA: 'yes' + deps: + - tags: get,python3 + names: + - python + - python3 + - tags: get,generic-python-lib,_package.pycuda + names: + - pycuda diff --git a/script/get-cuda-devices/customize.py b/script/get-cuda-devices/customize.py index 54fa9094f..4aaf21548 100644 --- a/script/get-cuda-devices/customize.py +++ b/script/get-cuda-devices/customize.py @@ -2,6 +2,15 @@ import os import subprocess +def preprocess(i): + + env = i['env'] + + if str(env.get('CM_CUDA_DEVICES_DETECT_USING_PYCUDA', '')).lower() in [ "1", "yes", "true"]: + i['run_script_input']['script_name'] = 'detect' + + return {'return':0} + def postprocess(i): env = i['env'] @@ -18,22 +27,36 @@ def postprocess(i): # properties p = {} + gpu = {} + + gpu_id = -1 for line in lst: - print (line) + #print (line) j = line.find(':') + if j>=0: - key = line[:j].strip() - val = line[j+1:].strip() + key = line[:j].strip() + val = line[j+1:].strip() + + if key == "GPU Device ID": + gpu_id+=1 + gpu[gpu_id] = {} - p[key] = val + if gpu_id < 0: + continue - key_env = 'CM_CUDA_DEVICE_PROP_'+key.upper().replace(' ','_') - env[key_env] = val + gpu[gpu_id][key] = val + p[key] = val + + key_env = 'CM_CUDA_DEVICE_PROP_'+key.upper().replace(' ','_') + env[key_env] = val + state['cm_cuda_num_devices'] = gpu_id + env['CM_CUDA_NUM_DEVICES'] = gpu_id state['cm_cuda_device_prop'] = p + state['cm_cuda_devices_prop'] = gpu - return {'return':0} - + return {'return':0} diff --git a/script/get-cuda-devices/detect.py b/script/get-cuda-devices/detect.py new file mode 100644 index 000000000..817e46a6f --- /dev/null +++ b/script/get-cuda-devices/detect.py @@ -0,0 +1,47 @@ +import pycuda.driver as cuda +import pycuda.autoinit + +def get_gpu_info(): + num_gpus = cuda.Device.count() + all_gpu_info = [] + + for i in range(num_gpus): + device = cuda.Device(i) + cuda_runtime_version = cuda.get_version() + cuda_runtime_version_str = f"{cuda_runtime_version[0]}.{cuda_runtime_version[1]}" + + gpu_info = { + "GPU Device ID": device.pci_bus_id(), + "GPU Name": device.name(), + "GPU compute capability": f"{device.compute_capability()[0]}.{device.compute_capability()[1]}", + "CUDA driver version": f"{cuda.get_driver_version() // 1000}.{(cuda.get_driver_version() % 1000) // 10}", + "CUDA runtime version": cuda_runtime_version_str, + "Global memory": device.total_memory(), + "Max clock rate": f"{device.get_attribute(cuda.device_attribute.CLOCK_RATE)} MHz", + "Total amount of shared memory per block": device.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK), + "Total number of registers available per block": device.get_attribute(cuda.device_attribute.MAX_REGISTERS_PER_BLOCK), + "Warp size": device.get_attribute(cuda.device_attribute.WARP_SIZE), + "Maximum number of threads per multiprocessor": device.get_attribute(cuda.device_attribute.MAX_THREADS_PER_MULTIPROCESSOR), + "Maximum number of threads per block": device.get_attribute(cuda.device_attribute.MAX_THREADS_PER_BLOCK), + "Max dimension size of a thread block X": device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_X), + "Max dimension size of a thread block Y": device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_Y), + "Max dimension size of a thread block Z": device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_Z), + "Max dimension size of a grid size X": device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_X), + "Max dimension size of a grid size Y": device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_Y), + "Max dimension size of a grid size Z": device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_Z), + } + + all_gpu_info.append(gpu_info) + + return all_gpu_info + + +# Print the GPU information for all available GPUs +if __name__ == "__main__": + gpu_info_list = get_gpu_info() + with open ("tmp-run.out", "w") as f: + for idx, gpu_info in enumerate(gpu_info_list): + print(f"GPU {idx}:") + for key, value in gpu_info.items(): + f.write(f"{key}: {value}\n") + diff --git a/script/get-cuda-devices/detect.sh b/script/get-cuda-devices/detect.sh new file mode 100644 index 000000000..8f6b93596 --- /dev/null +++ b/script/get-cuda-devices/detect.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +${CM_PYTHON_BIN_WITH_PATH} ${CM_TMP_CURRENT_SCRIPT_PATH}/detect.py +test $? -eq 0 || exit $? From f4a1ad293c554b0b139f1d28caca66a7913f8d32 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 01:56:18 +0530 Subject: [PATCH 61/67] Use updated get-cuda-devices in mlperf-inference --- script/app-mlperf-inference/_cm.yaml | 2 +- script/get-ml-model-gptj/_cm.json | 2 +- script/get-ml-model-llama2/_cm.json | 2 +- script/get-mlperf-inference-sut-description/_cm.json | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml index e55c1cdf0..82e67641d 100644 --- a/script/app-mlperf-inference/_cm.yaml +++ b/script/app-mlperf-inference/_cm.yaml @@ -1162,7 +1162,7 @@ variations: mlperf-inference-implementation: tags: _cuda deps: - - tags: get,cuda-devices + - tags: get,cuda-devices,_with-pycuda skip_if_env: CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY: - "yes" diff --git a/script/get-ml-model-gptj/_cm.json b/script/get-ml-model-gptj/_cm.json index 2b9c67e62..396e0d595 100644 --- a/script/get-ml-model-gptj/_cm.json +++ b/script/get-ml-model-gptj/_cm.json @@ -231,7 +231,7 @@ "tags": "get,nvidia,scratch,space" }, { - "tags": "get,cuda-devices" + "tags": "get,cuda-devices,_with-pycuda" }, { "tags": "get,ml-model,gpt-j,_fp32,_pytorch", diff --git a/script/get-ml-model-llama2/_cm.json b/script/get-ml-model-llama2/_cm.json index 0734395d5..d64c6e004 100644 --- a/script/get-ml-model-llama2/_cm.json +++ b/script/get-ml-model-llama2/_cm.json @@ -223,7 +223,7 @@ "tags": "get,nvidia,scratch,space" }, { - "tags": "get,cuda-devices" + "tags": "get,cuda-devices,_with-pycuda" }, { "tags": "get,ml-model,llama2-70b,_fp32,_pytorch", diff --git a/script/get-mlperf-inference-sut-description/_cm.json b/script/get-mlperf-inference-sut-description/_cm.json index a160722c2..f9c1b0345 100644 --- a/script/get-mlperf-inference-sut-description/_cm.json +++ b/script/get-mlperf-inference-sut-description/_cm.json @@ -25,7 +25,7 @@ "tags": "get,compiler" }, { - "tags": "get,cuda-devices", + "tags": "get,cuda-devices,_with-pycuda", "enable_if_env": { "CM_MLPERF_DEVICE": [ "gpu", From eb8910b8d497eace02e7bc0ef05f25c9dcf147f5 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 02:20:29 +0530 Subject: [PATCH 62/67] Improved meta for app-mlperf-inference --- .../app-mlperf-inference-mlcommons-python/_cm.yaml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/script/app-mlperf-inference-mlcommons-python/_cm.yaml b/script/app-mlperf-inference-mlcommons-python/_cm.yaml index 452a3f1dd..df7a5a1d7 100644 --- a/script/app-mlperf-inference-mlcommons-python/_cm.yaml +++ b/script/app-mlperf-inference-mlcommons-python/_cm.yaml @@ -185,8 +185,9 @@ deps: ## Pytorch (CPU) - tags: get,generic-python-lib,_torch names: - - ml-engine-pytorch - - pytorch + - torch + - ml-engine-pytorch + - pytorch skip_if_env: CM_MODEL: - dlrm-v2-99 @@ -838,6 +839,9 @@ variations: MLPERF_TVM_TORCH_QUANTIZED_ENGINE: qnnpack deps: - tags: get,generic-python-lib,_torch + names: + - torch + - pytorch - tags: get,tvm names: - tvm @@ -865,7 +869,6 @@ variations: gptj_: deps: - - tags: get,generic-python-lib,_torch - tags: get,generic-python-lib,_package.datasets - tags: get,generic-python-lib,_package.attrs - tags: get,generic-python-lib,_package.accelerate @@ -1099,6 +1102,10 @@ variations: - dlrm-src # to force the version - tags: get,generic-python-lib,_torch + names: + - torch + - pytorch + - ml-engine-pytorch version: "1.13.1" - tags: get,generic-python-lib,_mlperf_logging - tags: get,generic-python-lib,_opencv-python From 8d7d254ff6be6bb2b7a6f5ec609edea4cdc02f57 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 02:48:37 +0530 Subject: [PATCH 63/67] Stop the remaining mlperf runs for docker detached mode --- script/run-mlperf-inference-app/customize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/script/run-mlperf-inference-app/customize.py b/script/run-mlperf-inference-app/customize.py index 0722c016e..e2361f2de 100644 --- a/script/run-mlperf-inference-app/customize.py +++ b/script/run-mlperf-inference-app/customize.py @@ -133,6 +133,7 @@ def preprocess(i): ad = inp.get('ad', {}) adr = inp.get('adr', {}) docker_it = inp.get('docker_it', '') + docker_dt = inp.get('docker_dt', '') adr_from_meta = i['run_script_input'].get('add_deps_recursive') for key in adr_from_meta: @@ -237,7 +238,7 @@ def preprocess(i): env['CM_MLPERF_INFERENCE_RESULTS_DIR_'] = os.path.join(env['OUTPUT_BASE_DIR'], f"{env['CM_MLPERF_RUN_STYLE']}_results") if action == "docker": - if str(docker_it).lower() not in ["no", "false", "0"]: + if str(docker_dt).lower() not in ["yes", "true", "1"]: print(f"\nStop Running loadgen scenario: {scenario} and mode: {mode}") return {'return': 0} # We run commands interactively inside the docker container else: From 042079b83034d5232464795a48aec3ec497dbbd1 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 02:56:54 +0530 Subject: [PATCH 64/67] Fix number of accelerators (GPUs) for mlperf-inference --- script/get-cuda-devices/_cm.yaml | 2 ++ script/get-mlperf-inference-sut-description/customize.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/script/get-cuda-devices/_cm.yaml b/script/get-cuda-devices/_cm.yaml index 0c3fcfb23..64d49d95b 100644 --- a/script/get-cuda-devices/_cm.yaml +++ b/script/get-cuda-devices/_cm.yaml @@ -33,10 +33,12 @@ docker: new_env_keys: - CM_CUDA_DEVICE_* +- CM_CUDA_NUM_DEVICES new_state_keys: - cm_cuda_device_prop - cm_cuda_devices_prop +- cm_cuda_num_devices print_files_if_script_error: - tmp-run.out diff --git a/script/get-mlperf-inference-sut-description/customize.py b/script/get-mlperf-inference-sut-description/customize.py index 71636941f..cc36483c6 100644 --- a/script/get-mlperf-inference-sut-description/customize.py +++ b/script/get-mlperf-inference-sut-description/customize.py @@ -100,7 +100,8 @@ def preprocess(i): state['CM_SUT_META']['accelerator_frequency'] = state['cm_cuda_device_prop']['Max clock rate'] state['CM_SUT_META']['accelerator_memory_capacity'] = str(int(state['cm_cuda_device_prop']['Global memory'])/(1024*1024.0*1024)) + " GB" state['CM_SUT_META']['accelerator_model_name'] = state['cm_cuda_device_prop']['GPU Name'] - state['CM_SUT_META']['accelerators_per_node'] = "1" + num_accelerators = env.get('CM_CUDA_NUM_DEVICES', "1") + state['CM_SUT_META']['accelerators_per_node'] = num_accelerators if state['CM_SUT_META'].get('host_processor_core_count', '') == '': physical_cores_per_node = env.get('CM_HOST_CPU_PHYSICAL_CORES_PER_SOCKET') From cd24064d232d6e224012d543bce15bcaee982eb6 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 23:07:20 +0100 Subject: [PATCH 65/67] Update test-mlperf-inference-sdxl.yaml --- .github/workflows/test-mlperf-inference-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-mlperf-inference-sdxl.yaml b/.github/workflows/test-mlperf-inference-sdxl.yaml index a5e348336..c7d693495 100644 --- a/.github/workflows/test-mlperf-inference-sdxl.yaml +++ b/.github/workflows/test-mlperf-inference-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL on: schedule: - - cron: "1 2 * * */3" + - cron: "1 2 * * *" jobs: build_reference: From 3dbea4a03583d31eb1a07a026b6b316028258643 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 25 Sep 2024 23:08:24 +0100 Subject: [PATCH 66/67] Update test-scc24-sdxl.yaml --- .github/workflows/test-scc24-sdxl.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml index 3ced30e04..e9a2fa410 100644 --- a/.github/workflows/test-scc24-sdxl.yaml +++ b/.github/workflows/test-scc24-sdxl.yaml @@ -2,7 +2,7 @@ name: MLPerf inference SDXL on: schedule: - - cron: "1 3 * * */3" + - cron: "43 1 * * *" jobs: build_reference: From 12c779cd641dcdbc66a6e28388fcd58a122f1422 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Thu, 26 Sep 2024 16:08:19 +0530 Subject: [PATCH 67/67] Dont use venv for nvidia mlperf inference docker --- script/app-mlperf-inference/_cm.yaml | 1 + script/build-dockerfile/customize.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/script/app-mlperf-inference/_cm.yaml b/script/app-mlperf-inference/_cm.yaml index 82e67641d..c51b5b515 100644 --- a/script/app-mlperf-inference/_cm.yaml +++ b/script/app-mlperf-inference/_cm.yaml @@ -385,6 +385,7 @@ variations: CM_IMAGENET_ACCURACY_DTYPE: int32 CM_CNNDM_ACCURACY_DTYPE: int32 CM_LIBRISPEECH_ACCURACY_DTYPE: int8 + CM_DOCKER_USE_VIRTUAL_PYTHON: no prehook_deps: - names: - nvidia-original-mlperf-inference diff --git a/script/build-dockerfile/customize.py b/script/build-dockerfile/customize.py index b5bae64fc..41300cdde 100644 --- a/script/build-dockerfile/customize.py +++ b/script/build-dockerfile/customize.py @@ -180,8 +180,11 @@ def preprocess(i): f.write(EOL+'# Install python packages' + EOL) python = get_value(env, config, 'PYTHON', 'CM_DOCKERFILE_PYTHON') - f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL) - f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL) + + docker_use_virtual_python = env.get('CM_DOCKER_USE_VIRTUAL_PYTHON', "yes") + if str(docker_use_virtual_python).lower() not in [ "no", "0", "false"]: + f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL) + f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL) #f.write('RUN . /opt/venv/cm/bin/activate' + EOL) f.write('RUN {} -m pip install '.format(python) + " ".join(get_value(env, config, 'python-packages')) + ' ' + pip_extra_flags + ' ' + EOL)