Merge pull request #298 from mlcommons/mlperf-inference
dev <- Mlperf inference
arjunsuresh authored Sep 26, 2024
2 parents 0a9238f + 03a740f commit db60dad
Showing 16 changed files with 188 additions and 76 deletions.
44 changes: 28 additions & 16 deletions .github/workflows/code-review.yml
@@ -1,22 +1,34 @@
name: OpenAI Code Review

on:
pull_request:
pull_request_target:
types: [opened, synchronize]
paths:
- 'automation/**'
- 'script/**'
- '!**.md'

permissions:
issues: write
pull-requests: write

jobs:
code_review_job:
code_review:
runs-on: ubuntu-latest
if: github.repository_owner == 'gateoverflow'
name: ChatGPT Code Review
if: github.repository_owner == 'gateoverflow' && github.event.pull_request.changed_files > 0
steps:
- name: GenAI Code Review
uses: cirolini/genai-code-review@v2
with:
openai_api_key: ${{ secrets.openai_api_key }}
github_token: ${{ secrets.GITHUB_TOKEN }}
github_pr_id: ${{ github.event.number }}
openai_model: "GPT-4o" # optional
openai_temperature: 0.5 # optional
openai_max_tokens: 2048 # optional
mode: files # files or patch
language: en # optional, default is 'en'
custom_prompt: "" # optional
# Run code review via OpenAI
# Step to run the OpenAI Code Review using the GATEOverflow action
- name: Run OpenAI Code Review
uses: GATEOverflow/genai-code-review@v1
with:
github_token: ${{ secrets.GITHUB_TOKEN }} # GitHub token for authentication
openai_api_key: ${{ secrets.OPENAI_API_KEY }} # OpenAI API key for accessing the GPT model
github_pr_id: ${{ github.event.pull_request.number }} # ID of the pull request to review
openai_model: "gpt-4o" # Model to use for the code review
openai_temperature: 0.5 # Temperature setting for the model's output
openai_max_tokens: 2048 # Maximum number of tokens for the model's response
mode: "files" # Mode of review, can be "files" or "diff"
language: "en" # Language for the review output
custom_prompt: "" # Optional custom prompt for the model
continue-on-error: true # Allow the workflow to continue even if this step fails
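Note that the trigger moves from pull_request to pull_request_target (so the workflow runs with base-repository secrets on forked PRs) and the job is now gated on a non-empty change set. A minimal sketch of what that changed_files guard evaluates, reading the same event payload the runner exposes (GITHUB_EVENT_PATH is set by GitHub Actions; the payload field mirrors the expression in the workflow):

import json, os

# Sketch: the `if:` guard above compares against this webhook payload field.
event = json.load(open(os.environ["GITHUB_EVENT_PATH"]))
should_review = event["pull_request"]["changed_files"] > 0
print("run code review:", should_review)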
2 changes: 1 addition & 1 deletion .github/workflows/test-mlperf-inference-sdxl.yaml
@@ -2,7 +2,7 @@ name: MLPerf inference SDXL

on:
schedule:
- cron: "1 2 * * */3"
- cron: "1 2 * * *"

jobs:
build_reference:
2 changes: 1 addition & 1 deletion .github/workflows/test-scc24-sdxl.yaml
@@ -2,7 +2,7 @@ name: MLPerf inference SDXL

on:
schedule:
- cron: "1 3 * * */3"
- cron: "43 1 * * *"

jobs:
build_reference:
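Both schedule tweaks (here and in test-mlperf-inference-sdxl.yaml above) move from a cron stepped over the day-of-week field to a daily one. A small sketch, assuming the third-party croniter package, comparing the firing times of the SDXL schedules:

from datetime import datetime
from croniter import croniter  # assumes: pip install croniter

base = datetime(2024, 9, 26)
old = croniter("1 2 * * */3", base)  # 02:01 on Sun/Wed/Sat only (DOW 0,3,6)
new = croniter("1 2 * * *", base)    # 02:01 every day
print("old:", [old.get_next(datetime).strftime("%a %H:%M") for _ in range(3)])
print("new:", [new.get_next(datetime).strftime("%a %H:%M") for _ in range(3)])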
13 changes: 10 additions & 3 deletions script/app-mlperf-inference-mlcommons-python/_cm.yaml
@@ -185,8 +185,9 @@ deps:
## Pytorch (CPU)
- tags: get,generic-python-lib,_torch
names:
- ml-engine-pytorch
- pytorch
- torch
- ml-engine-pytorch
- pytorch
skip_if_env:
CM_MODEL:
- dlrm-v2-99
@@ -838,6 +839,9 @@ variations:
MLPERF_TVM_TORCH_QUANTIZED_ENGINE: qnnpack
deps:
- tags: get,generic-python-lib,_torch
names:
- torch
- pytorch
- tags: get,tvm
names:
- tvm
@@ -865,7 +869,6 @@ variations:

gptj_:
deps:
- tags: get,generic-python-lib,_torch
- tags: get,generic-python-lib,_package.datasets
- tags: get,generic-python-lib,_package.attrs
- tags: get,generic-python-lib,_package.accelerate
@@ -1099,6 +1102,10 @@ variations:
- dlrm-src
# to force the version
- tags: get,generic-python-lib,_torch
names:
- torch
- pytorch
- ml-engine-pytorch
version: "1.13.1"
- tags: get,generic-python-lib,_mlperf_logging
- tags: get,generic-python-lib,_opencv-python
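The recurring pattern in this file is adding names aliases (torch, pytorch, ml-engine-pytorch) to the _torch dependency so callers can target it for overrides, as the pinned version: "1.13.1" entry above does for DLRM. A hedged sketch of the same override via the cmind Python API (the tag set and the 'adr' input key are assumptions, modeled on CM's documented --adr.<name> CLI convention):

import cmind  # assumes: pip install cmind, plus the mlcommons cm4mlops repo

r = cmind.access({
    'action': 'run',
    'automation': 'script',
    'tags': 'app,mlperf,inference,reference,_dlrm-v2-99',  # hypothetical tags
    'adr': {'pytorch': {'version': '1.13.1'}},  # resolved via the new alias
    'quiet': True,
})
print('ok' if r['return'] == 0 else r.get('error'))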
3 changes: 2 additions & 1 deletion script/app-mlperf-inference/_cm.yaml
@@ -385,6 +385,7 @@ variations:
CM_IMAGENET_ACCURACY_DTYPE: int32
CM_CNNDM_ACCURACY_DTYPE: int32
CM_LIBRISPEECH_ACCURACY_DTYPE: int8
CM_DOCKER_USE_VIRTUAL_PYTHON: no
prehook_deps:
- names:
- nvidia-original-mlperf-inference
@@ -1162,7 +1163,7 @@ variations:
mlperf-inference-implementation:
tags: _cuda
deps:
- tags: get,cuda-devices
- tags: get,cuda-devices,_with-pycuda
skip_if_env:
CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY:
- "yes"
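The cuda-devices dependency now pulls the new _with-pycuda variation, and its skip_if_env entry means detection is skipped when the GPU memory property is already known. A tiny sketch of skip_if_env semantics as I read them (illustrative helper, not CM's actual implementation):

# Sketch: a dep guarded by skip_if_env runs only when the env var does not
# already hold one of the listed values (illustrative, not CM internals).
def dep_should_run(skip_if_env, env):
    return not any(env.get(var) in vals for var, vals in skip_if_env.items())

print(dep_should_run({'CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY': ['yes']}, {}))  # True
print(dep_should_run({'CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY': ['yes']},
                     {'CM_CUDA_DEVICE_PROP_GLOBAL_MEMORY': 'yes'}))        # False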
7 changes: 5 additions & 2 deletions script/build-dockerfile/customize.py
@@ -180,8 +180,11 @@ def preprocess(i):

f.write(EOL+'# Install python packages' + EOL)
python = get_value(env, config, 'PYTHON', 'CM_DOCKERFILE_PYTHON')
f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL)
f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL)

docker_use_virtual_python = env.get('CM_DOCKER_USE_VIRTUAL_PYTHON', "yes")
if str(docker_use_virtual_python).lower() not in [ "no", "0", "false"]:
f.write('RUN {} -m venv /home/cmuser/venv/cm'.format(python) + " " + EOL)
f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"' + EOL)
#f.write('RUN . /opt/venv/cm/bin/activate' + EOL)
f.write('RUN {} -m pip install '.format(python) + " ".join(get_value(env, config, 'python-packages')) + ' ' + pip_extra_flags + ' ' + EOL)

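The new CM_DOCKER_USE_VIRTUAL_PYTHON flag defaults to "yes"; only an explicit no/0/false (any case) skips the venv lines in the generated Dockerfile. A self-contained sketch of the gating and what it writes (file handle stubbed with StringIO, package list illustrative):

import io

def emit_python_setup(env, python='python3', packages='cmind requests'):
    f = io.StringIO()
    use_venv = env.get('CM_DOCKER_USE_VIRTUAL_PYTHON', 'yes')
    if str(use_venv).lower() not in ['no', '0', 'false']:
        f.write('RUN {} -m venv /home/cmuser/venv/cm\n'.format(python))
        f.write('ENV PATH="/home/cmuser/venv/cm/bin:$PATH"\n')
    f.write('RUN {} -m pip install {}\n'.format(python, packages))
    return f.getvalue()

print(emit_python_setup({}))                                      # venv created
print(emit_python_setup({'CM_DOCKER_USE_VIRTUAL_PYTHON': 'no'}))  # venv skipped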
75 changes: 36 additions & 39 deletions script/download-file/run.sh
@@ -1,61 +1,58 @@
#!/bin/bash


# Execute config command if it exists
if [[ -n ${CM_DOWNLOAD_CONFIG_CMD} ]]; then
echo ""
echo "${CM_DOWNLOAD_CONFIG_CMD}"
eval "${CM_DOWNLOAD_CONFIG_CMD}"
test $? -eq 0 || exit $?
echo -e "\nExecuting: ${CM_DOWNLOAD_CONFIG_CMD}"
eval "${CM_DOWNLOAD_CONFIG_CMD}" || exit $?
fi

# Assume download is required by default
require_download=1

if [[ "${CM_DOWNLOAD_LOCAL_FILE_PATH}" != "" ]]; then
# No download needed if a local file path is specified or the tool is 'cmutil'
if [[ -n "${CM_DOWNLOAD_LOCAL_FILE_PATH}" || ${CM_DOWNLOAD_TOOL} == "cmutil" ]]; then
require_download=0
fi

if [[ ${CM_DOWNLOAD_TOOL} == "cmutil" ]]; then
require_download=0

elif [ -e "${CM_DOWNLOAD_DOWNLOADED_PATH}" ]; then
if [[ "${CM_DOWNLOAD_CHECKSUM_CMD}" != "" ]]; then
echo ""
echo "${CM_DOWNLOAD_CHECKSUM_CMD}"
eval "${CM_DOWNLOAD_CHECKSUM_CMD}"
if [ $? -ne 0 ]; then
# checksum not supposed to fail for locally given file
if [[ "${CM_DOWNLOAD_LOCAL_FILE_PATH}" != "" ]]; then
exit 1
else
CM_PRE_DOWNLOAD_CLEAN=true
fi
# If the file exists, check the checksum if necessary
if [[ -e "${CM_DOWNLOAD_DOWNLOADED_PATH}" && -n "${CM_DOWNLOAD_CHECKSUM_CMD}" ]]; then
echo -e "\nChecking checksum: ${CM_DOWNLOAD_CHECKSUM_CMD}"
eval "${CM_DOWNLOAD_CHECKSUM_CMD}"

if [[ $? -ne 0 ]]; then
# If the checksum fails, handle errors based on whether the file is local
if [[ -n "${CM_DOWNLOAD_LOCAL_FILE_PATH}" ]]; then
echo "Checksum failed for local file. Exiting."
exit 1
else
require_download="0"
echo "Checksum failed. Marking for re-download."
CM_PRE_DOWNLOAD_CLEAN=true
fi
else
# If checksum succeeds, no download is required
require_download=0
fi
fi

if [[ ${require_download} == "1" ]]; then
# Perform download if required
if [[ ${require_download} == 1 ]]; then
echo ""
if [ -n "${CM_PRE_DOWNLOAD_CLEAN}" ] && [ "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]; then
echo "${CM_PRE_DOWNLOAD_CLEAN_CMD}"
eval "${CM_PRE_DOWNLOAD_CLEAN_CMD}"
fi

echo ""
echo "${CM_DOWNLOAD_CMD}"
eval "${CM_DOWNLOAD_CMD}"
test $? -eq 0 || exit $?
# If a pre-download clean command is specified and needed, execute it
if [[ -n "${CM_PRE_DOWNLOAD_CLEAN}" && "${CM_PRE_DOWNLOAD_CLEAN,,}" != "false" ]]; then
echo "Executing pre-download clean: ${CM_PRE_DOWNLOAD_CLEAN_CMD}"
eval "${CM_PRE_DOWNLOAD_CLEAN_CMD}" || exit $?
fi

# Execute the download command
echo "Downloading: ${CM_DOWNLOAD_CMD}"
eval "${CM_DOWNLOAD_CMD}" || exit $?
fi

if [[ ${CM_DOWNLOAD_TOOL} == "cmutil" || ${require_download} == "1" ]]; then
if [[ "${CM_DOWNLOAD_CHECKSUM_CMD}" != "" ]]; then
echo ""
echo "${CM_DOWNLOAD_CHECKSUM_CMD}"
eval "${CM_DOWNLOAD_CHECKSUM_CMD}"
test $? -eq 0 || exit $?
# Verify checksum again if necessary
if [[ ${CM_DOWNLOAD_TOOL} == "cmutil" || ${require_download} == 1 ]]; then
if [[ -n "${CM_DOWNLOAD_CHECKSUM_CMD}" ]]; then
echo -e "\nVerifying checksum after download: ${CM_DOWNLOAD_CHECKSUM_CMD}"
eval "${CM_DOWNLOAD_CHECKSUM_CMD}" || exit $?
fi
fi

test $? -eq 0 || exit $?
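The rewritten script reduces to one decision procedure: local files and the cmutil tool never download; an existing file with a failing checksum is cleaned and re-downloaded (though a failing local file is a hard error); and anything downloaded, or fetched via cmutil, is checksum-verified afterwards. A hedged Python restatement of that control flow (names are illustrative, not a drop-in replacement):

# Illustrative restatement of run.sh's decision flow.
def plan(local_path, tool, file_exists, checksum_ok):
    require_download = not (local_path or tool == 'cmutil')
    pre_clean = False
    if file_exists and checksum_ok is not None:  # None = no checksum command
        if checksum_ok:
            require_download = False
        elif local_path:
            raise SystemExit('checksum failed for local file')
        else:
            pre_clean = True  # stale download: clean, then fetch again
    verify_after = tool == 'cmutil' or require_download
    return require_download, pre_clean, verify_after

print(plan(local_path='', tool='wget', file_exists=True, checksum_ok=False))
# -> (True, True, True): bad checksum triggers clean + re-download + re-verify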
16 changes: 16 additions & 0 deletions script/get-cuda-devices/_cm.yaml
@@ -33,9 +33,25 @@ docker:

new_env_keys:
- CM_CUDA_DEVICE_*
- CM_CUDA_NUM_DEVICES

new_state_keys:
- cm_cuda_device_prop
- cm_cuda_devices_prop
- cm_cuda_num_devices

print_files_if_script_error:
- tmp-run.out

variations:
with-pycuda:
env:
CM_CUDA_DEVICES_DETECT_USING_PYCUDA: 'yes'
deps:
- tags: get,python3
names:
- python
- python3
- tags: get,generic-python-lib,_package.pycuda
names:
- pycuda
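The new with-pycuda variation switches device detection to a Python/pycuda path (the detect.py added below) instead of the compiled CUDA sample. A hedged sketch of invoking it through the cmind API and reading back the newly exported key (assumes cmind plus the cm4mlops repo are installed; the new_env result key follows CM's usual convention):

import cmind  # assumes: pip install cmind

r = cmind.access({'action': 'run', 'automation': 'script',
                  'tags': 'get,cuda-devices,_with-pycuda', 'quiet': True})
if r['return'] == 0:
    print('CM_CUDA_NUM_DEVICES =', r.get('new_env', {}).get('CM_CUDA_NUM_DEVICES'))
else:
    print('detection failed:', r.get('error', ''))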
39 changes: 31 additions & 8 deletions script/get-cuda-devices/customize.py
@@ -2,6 +2,15 @@
import os
import subprocess

def preprocess(i):

env = i['env']

if str(env.get('CM_CUDA_DEVICES_DETECT_USING_PYCUDA', '')).lower() in [ "1", "yes", "true"]:
i['run_script_input']['script_name'] = 'detect'

return {'return':0}

def postprocess(i):

env = i['env']
@@ -18,22 +27,36 @@ def postprocess(i):

# properties
p = {}
gpu = {}

gpu_id = -1

for line in lst:
print (line)
#print (line)

j = line.find(':')

if j>=0:
key = line[:j].strip()
val = line[j+1:].strip()
key = line[:j].strip()
val = line[j+1:].strip()

if key == "GPU Device ID":
gpu_id+=1
gpu[gpu_id] = {}

p[key] = val
if gpu_id < 0:
continue

key_env = 'CM_CUDA_DEVICE_PROP_'+key.upper().replace(' ','_')
env[key_env] = val
gpu[gpu_id][key] = val
p[key] = val

key_env = 'CM_CUDA_DEVICE_PROP_'+key.upper().replace(' ','_')
env[key_env] = val

state['cm_cuda_num_devices'] = gpu_id
env['CM_CUDA_NUM_DEVICES'] = gpu_id

state['cm_cuda_device_prop'] = p
state['cm_cuda_devices_prop'] = gpu

return {'return':0}

return {'return':0}
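The reworked postprocess groups properties per GPU by treating each "GPU Device ID" line as a record delimiter, while keeping the old flat views for backward compatibility. The same loop, run on made-up sample output:

# The grouping logic from postprocess(), demonstrated on hypothetical values.
sample = [
    "GPU Device ID: 0000:3B:00.0",      # sample values, not real hardware
    "GPU Name: NVIDIA A100-SXM4-40GB",
    "Global memory: 42505273344",
    "GPU Device ID: 0000:AF:00.0",
    "GPU Name: NVIDIA A100-SXM4-40GB",
]
gpu, p, env, gpu_id = {}, {}, {}, -1
for line in sample:
    j = line.find(':')
    if j < 0:
        continue
    key, val = line[:j].strip(), line[j + 1:].strip()
    if key == "GPU Device ID":
        gpu_id += 1
        gpu[gpu_id] = {}
    if gpu_id < 0:
        continue  # ignore noise before the first device header
    gpu[gpu_id][key] = val
    p[key] = val  # flat view ends up holding the last GPU's values
    env['CM_CUDA_DEVICE_PROP_' + key.upper().replace(' ', '_')] = val

print(len(gpu), "GPUs;", env['CM_CUDA_DEVICE_PROP_GPU_NAME'])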
47 changes: 47 additions & 0 deletions script/get-cuda-devices/detect.py
@@ -0,0 +1,47 @@
import pycuda.driver as cuda
import pycuda.autoinit

def get_gpu_info():
num_gpus = cuda.Device.count()
all_gpu_info = []

for i in range(num_gpus):
device = cuda.Device(i)
cuda_runtime_version = cuda.get_version()
cuda_runtime_version_str = f"{cuda_runtime_version[0]}.{cuda_runtime_version[1]}"

gpu_info = {
"GPU Device ID": device.pci_bus_id(),
"GPU Name": device.name(),
"GPU compute capability": f"{device.compute_capability()[0]}.{device.compute_capability()[1]}",
"CUDA driver version": f"{cuda.get_driver_version() // 1000}.{(cuda.get_driver_version() % 1000) // 10}",
"CUDA runtime version": cuda_runtime_version_str,
"Global memory": device.total_memory(),
"Max clock rate": f"{device.get_attribute(cuda.device_attribute.CLOCK_RATE)} MHz",
"Total amount of shared memory per block": device.get_attribute(cuda.device_attribute.MAX_SHARED_MEMORY_PER_BLOCK),
"Total number of registers available per block": device.get_attribute(cuda.device_attribute.MAX_REGISTERS_PER_BLOCK),
"Warp size": device.get_attribute(cuda.device_attribute.WARP_SIZE),
"Maximum number of threads per multiprocessor": device.get_attribute(cuda.device_attribute.MAX_THREADS_PER_MULTIPROCESSOR),
"Maximum number of threads per block": device.get_attribute(cuda.device_attribute.MAX_THREADS_PER_BLOCK),
"Max dimension size of a thread block X": device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_X),
"Max dimension size of a thread block Y": device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_Y),
"Max dimension size of a thread block Z": device.get_attribute(cuda.device_attribute.MAX_BLOCK_DIM_Z),
"Max dimension size of a grid size X": device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_X),
"Max dimension size of a grid size Y": device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_Y),
"Max dimension size of a grid size Z": device.get_attribute(cuda.device_attribute.MAX_GRID_DIM_Z),
}

all_gpu_info.append(gpu_info)

return all_gpu_info


# Print the GPU information for all available GPUs
if __name__ == "__main__":
gpu_info_list = get_gpu_info()
with open ("tmp-run.out", "w") as f:
for idx, gpu_info in enumerate(gpu_info_list):
print(f"GPU {idx}:")
for key, value in gpu_info.items():
f.write(f"{key}: {value}\n")

4 changes: 4 additions & 0 deletions script/get-cuda-devices/detect.sh
@@ -0,0 +1,4 @@
#!/bin/bash

${CM_PYTHON_BIN_WITH_PATH} ${CM_TMP_CURRENT_SCRIPT_PATH}/detect.py
test $? -eq 0 || exit $?
2 changes: 1 addition & 1 deletion script/get-ml-model-gptj/_cm.json
@@ -231,7 +231,7 @@
"tags": "get,nvidia,scratch,space"
},
{
"tags": "get,cuda-devices"
"tags": "get,cuda-devices,_with-pycuda"
},
{
"tags": "get,ml-model,gpt-j,_fp32,_pytorch",
2 changes: 1 addition & 1 deletion script/get-ml-model-llama2/_cm.json
@@ -223,7 +223,7 @@
"tags": "get,nvidia,scratch,space"
},
{
"tags": "get,cuda-devices"
"tags": "get,cuda-devices,_with-pycuda"
},
{
"tags": "get,ml-model,llama2-70b,_fp32,_pytorch",
(3 remaining changed files not loaded)
