Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[frontend] added overflow checks in debug mode #4589

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ jobs:
- name: Install pip dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
- name: Install Triton
env:
TRITON_BUILD_WITH_CCACHE: "true"
Expand All @@ -239,8 +239,9 @@ jobs:
echo "Could not find '${SHARED_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
python3 -m pytest -s -n 8 language/test_subprocess.py
python3 -m pytest -s -n 8 test_debug.py --forked
# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
# Run hopper/test_flashattention.py separately to avoid out of gpu memory
Expand Down Expand Up @@ -382,7 +383,10 @@ jobs:
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py
--ignore=language/test_line_info.py \
--ignore=test_debug.py
# TODO: uncomment
# pytest --capture=tee-sys -rfs test_debug.py
TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

Expand Down Expand Up @@ -494,7 +498,7 @@ jobs:
python3 -m venv ~/.venv
source ~/.venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
- name: Install Triton
env:
TRITON_BUILD_WITH_CCACHE: "true"
Expand Down
12 changes: 8 additions & 4 deletions .github/workflows/integration-tests.yml.in
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ jobs:
- name: Install pip dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install wheel cmake==3.24 ninja pytest-forked pytest-xdist lit

- name: Install Triton
env:
Expand Down Expand Up @@ -274,8 +274,9 @@ jobs:
echo "Could not find '${SHARED_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
python3 -m pytest -s -n 8 language/test_subprocess.py
python3 -m pytest -s -n 8 test_debug.py --forked
# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
# Run hopper/test_flashattention.py separately to avoid out of gpu memory
Expand Down Expand Up @@ -387,7 +388,10 @@ jobs:
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py
--ignore=language/test_line_info.py \
--ignore=test_debug.py
# TODO: uncomment
# pytest --capture=tee-sys -rfs test_debug.py
TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

Expand Down Expand Up @@ -440,7 +444,7 @@ jobs:
python3 -m venv ~/.venv
source ~/.venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
- name: Install Triton
env:
TRITON_BUILD_WITH_CCACHE: "true"
Expand Down
154 changes: 0 additions & 154 deletions python/test/unit/language/assert_helper.py

This file was deleted.

61 changes: 0 additions & 61 deletions python/test/unit/language/test_subprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@

dir_path = os.path.dirname(os.path.realpath(__file__))
print_path = os.path.join(dir_path, "print_helper.py")
assert_path = os.path.join(dir_path, "assert_helper.py")

# TODO: bfloat16 after LLVM-15
assert_types = ["device_assert", "device_assert_passes", "assert", "static_assert", "no_debug", "double_assert"]
nested_types = [(caller, callee) for caller in ["true", "false", "none"] for callee in ["true", "false", "none"]]
torch_types = ["int8", "uint8", "int16", "int32", "long", "float16", "float32", "float64"]


Expand Down Expand Up @@ -120,59 +115,3 @@ def test_print(func_type: str, data_type: str, device: str):
continue
print(f'Expected line "{line}" {expected_lines[line]} time(s), but saw {actual_lines[line]} time(s)')
assert all(delta == 0 for delta in diff.values())


@pytest.mark.parametrize("func_type", assert_types)
def test_assert(func_type: str, device: str):
# The total number of elements in the 1-D tensor to assert on.
N = 128

proc = subprocess.run(
[sys.executable, assert_path, "test_assert", func_type, device],
capture_output=True,
env={**os.environ, "TRITON_DEBUG": "1"},
)
errs = proc.stderr.splitlines()
num_errs = 0
for err in errs:
if "x != 0" in err.decode("utf-8", errors="ignore"):
num_errs += 1

# Check for segfaults.
assert all("segmentation fault" not in line.decode("utf-8", errors="ignore").lower() for line in errs)

if func_type == "static_assert" or func_type == "device_assert_passes":
assert num_errs == 0
else:
assert num_errs == N - 1


@pytest.mark.parametrize("caller_type, callee_type", nested_types)
def test_assert_nested(caller_type, callee_type, device):
# The total number of elements in the 1-D tensor to assert on.
N = 128

proc = subprocess.run(
[sys.executable, assert_path, "test_assert_nested", caller_type, callee_type, device],
capture_output=True,
)
errs = proc.stderr.splitlines()
num_errs = 0
for err in errs:
if "x != 0" in err.decode("utf-8", errors="ignore"):
num_errs += 1
if caller_type == "none":
if callee_type == "true":
assert num_errs == N - 1
else:
assert num_errs == 0
elif caller_type == "true":
if callee_type == "false":
assert num_errs == 0
else:
assert num_errs == N - 1
elif caller_type == "false":
if callee_type == "true":
assert num_errs == N - 1
else:
assert num_errs == 0
25 changes: 10 additions & 15 deletions python/test/unit/runtime/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,23 +427,18 @@ def kernel_add(a, b, o, N: tl.constexpr):
def test_jit_debug() -> None:

@triton.jit
def kernel_add(a, b, o, N: tl.constexpr):
idx = tl.arange(0, N)
tl.device_assert(idx < 32, "idx < 32")
tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))
def kernel(tmp):
tl.device_assert(tl.load(tmp) == 1, "tmp == 1")

device = torch.cuda.current_device()
assert len(kernel_add.cache[device]) == 0
kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
assert len(kernel_add.cache[device]) == 1
kernel_add.debug = False
kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
assert len(kernel_add.cache[device]) == 2
kernel_add.debug = True
kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
assert len(kernel_add.cache[device]) == 3
bins = list(kernel_add.cache[device].values())
assert bins[2].asm['ttir'] != bins[1].asm['ttir']
tmp = torch.tensor([1], dtype=torch.int32, device="cuda")
assert len(kernel.cache[device]) == 0
kernel[(1, )](tmp, debug=False)
assert len(kernel.cache[device]) == 1
kernel[(1, )](tmp, debug=True)
assert len(kernel.cache[device]) == 2
bins = list(kernel.cache[device].values())
assert bins[0].asm['ttir'] != bins[1].asm['ttir']


@triton.jit
Expand Down
Loading
Loading