Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[frontend] added overflow checks in debug mode #4589

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ jobs:
- name: Install pip dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
- name: Install Triton
env:
TRITON_BUILD_WITH_CCACHE: "true"
Expand All @@ -239,8 +239,9 @@ jobs:
echo "Could not find '${SHARED_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
python3 -m pytest -s -n 8 language/test_subprocess.py
python3 -m pytest -s -n 8 test_debug.py --forked
# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
# Run hopper/test_flashattention.py separately to avoid out of gpu memory
Expand Down Expand Up @@ -382,7 +383,10 @@ jobs:
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py
--ignore=language/test_line_info.py \
--ignore=test_debug.py
# TODO: uncomment
# pytest --capture=tee-sys -rfs test_debug.py
TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

Expand Down Expand Up @@ -494,7 +498,7 @@ jobs:
python3 -m venv ~/.venv
source ~/.venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
- name: Install Triton
env:
TRITON_BUILD_WITH_CCACHE: "true"
Expand Down
12 changes: 8 additions & 4 deletions .github/workflows/integration-tests.yml.in
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ jobs:
- name: Install pip dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install wheel cmake==3.24 ninja pytest-forked pytest-xdist lit

- name: Install Triton
env:
Expand Down Expand Up @@ -274,8 +274,9 @@ jobs:
echo "Could not find '${SHARED_LIB_DIR}'" ; exit -1
fi
cd python/test/unit
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
python3 -m pytest -s -n 8 --ignore=hopper/test_flashattention.py --ignore=language/test_line_info.py --ignore=language/test_subprocess.py --ignore=test_debug.py
python3 -m pytest -s -n 8 language/test_subprocess.py
python3 -m pytest -s -n 8 test_debug.py --forked
# Run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest -s language/test_line_info.py
# Run hopper/test_flashattention.py separately to avoid out of gpu memory
Expand Down Expand Up @@ -387,7 +388,10 @@ jobs:
pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
cd python/test/unit
pytest --capture=tee-sys -rfs -n 16 language runtime \
--ignore=language/test_line_info.py
--ignore=language/test_line_info.py \
--ignore=test_debug.py
# TODO: uncomment
# pytest --capture=tee-sys -rfs test_debug.py
TRITON_ALWAYS_COMPILE=1 TRITON_DISABLE_LINE_INFO=0 LLVM_PASS_PLUGIN_PATH=${SHARED_LIB_DIR}/libGPUHello.so \
pytest --capture=tee-sys -rfs -vvv instrumentation/test_gpuhello.py

Expand Down Expand Up @@ -440,7 +444,7 @@ jobs:
python3 -m venv ~/.venv
source ~/.venv/bin/activate
python3 -m pip install --upgrade pip
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-xdist lit
python3 -m pip install cython setuptools wheel cmake==3.24 ninja pytest-forked pytest-xdist lit
- name: Install Triton
env:
TRITON_BUILD_WITH_CCACHE: "true"
Expand Down
154 changes: 0 additions & 154 deletions python/test/unit/language/assert_helper.py

This file was deleted.

61 changes: 0 additions & 61 deletions python/test/unit/language/test_subprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@

dir_path = os.path.dirname(os.path.realpath(__file__))
print_path = os.path.join(dir_path, "print_helper.py")
assert_path = os.path.join(dir_path, "assert_helper.py")

# TODO: bfloat16 after LLVM-15
assert_types = ["device_assert", "device_assert_passes", "assert", "static_assert", "no_debug", "double_assert"]
nested_types = [(caller, callee) for caller in ["true", "false", "none"] for callee in ["true", "false", "none"]]
torch_types = ["int8", "uint8", "int16", "int32", "long", "float16", "float32", "float64"]


Expand Down Expand Up @@ -120,59 +115,3 @@ def test_print(func_type: str, data_type: str, device: str):
continue
print(f'Expected line "{line}" {expected_lines[line]} time(s), but saw {actual_lines[line]} time(s)')
assert all(delta == 0 for delta in diff.values())


@pytest.mark.parametrize("func_type", assert_types)
def test_assert(func_type: str, device: str):
# The total number of elements in the 1-D tensor to assert on.
N = 128

proc = subprocess.run(
[sys.executable, assert_path, "test_assert", func_type, device],
capture_output=True,
env={**os.environ, "TRITON_DEBUG": "1"},
)
errs = proc.stderr.splitlines()
num_errs = 0
for err in errs:
if "x != 0" in err.decode("utf-8", errors="ignore"):
num_errs += 1

# Check for segfaults.
assert all("segmentation fault" not in line.decode("utf-8", errors="ignore").lower() for line in errs)

if func_type == "static_assert" or func_type == "device_assert_passes":
assert num_errs == 0
else:
assert num_errs == N - 1


@pytest.mark.parametrize("caller_type, callee_type", nested_types)
def test_assert_nested(caller_type, callee_type, device):
# The total number of elements in the 1-D tensor to assert on.
N = 128

proc = subprocess.run(
[sys.executable, assert_path, "test_assert_nested", caller_type, callee_type, device],
capture_output=True,
)
errs = proc.stderr.splitlines()
num_errs = 0
for err in errs:
if "x != 0" in err.decode("utf-8", errors="ignore"):
num_errs += 1
if caller_type == "none":
if callee_type == "true":
assert num_errs == N - 1
else:
assert num_errs == 0
elif caller_type == "true":
if callee_type == "false":
assert num_errs == 0
else:
assert num_errs == N - 1
elif caller_type == "false":
if callee_type == "true":
assert num_errs == N - 1
else:
assert num_errs == 0
25 changes: 10 additions & 15 deletions python/test/unit/runtime/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,23 +427,18 @@ def kernel_add(a, b, o, N: tl.constexpr):
def test_jit_debug() -> None:

@triton.jit
def kernel_add(a, b, o, N: tl.constexpr):
idx = tl.arange(0, N)
tl.device_assert(idx < 32, "idx < 32")
tl.store(o + idx, tl.load(a + idx) + tl.load(b + idx))
def kernel(tmp):
tl.device_assert(tl.load(tmp) == 1, "tmp == 1")

device = torch.cuda.current_device()
assert len(kernel_add.cache[device]) == 0
kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
assert len(kernel_add.cache[device]) == 1
kernel_add.debug = False
kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
assert len(kernel_add.cache[device]) == 2
kernel_add.debug = True
kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1, ))
assert len(kernel_add.cache[device]) == 3
bins = list(kernel_add.cache[device].values())
assert bins[2].asm['ttir'] != bins[1].asm['ttir']
tmp = torch.tensor([1], dtype=torch.int32, device="cuda")
assert len(kernel.cache[device]) == 0
kernel[(1, )](tmp, debug=False)
assert len(kernel.cache[device]) == 1
kernel[(1, )](tmp, debug=True)
assert len(kernel.cache[device]) == 2
bins = list(kernel.cache[device].values())
assert bins[0].asm['ttir'] != bins[1].asm['ttir']


@triton.jit
Expand Down
Loading
Loading