From ec741c8dd0d78e9a4f2362d1e6b9c3e838271483 Mon Sep 17 00:00:00 2001
From: ksimpson
Date: Thu, 24 Oct 2024 14:01:14 -0700
Subject: [PATCH] improve some tests

---
 cuda_core/tests/notes           |   2 +
 cuda_core/tests/test_device.py  |  25 +++--
 cuda_core/tests/test_memory.py  | 168 ++++++++++++++++++++++++++++----
 cuda_core/tests/test_program.py |   8 ++
 4 files changed, 177 insertions(+), 26 deletions(-)
 create mode 100644 cuda_core/tests/notes

diff --git a/cuda_core/tests/notes b/cuda_core/tests/notes
new file mode 100644
index 0000000..97fd5a1
--- /dev/null
+++ b/cuda_core/tests/notes
@@ -0,0 +1,2 @@
+how to query the devices available on the system
+how to copy memory from device to host and vice versa; copy_to/copy_from take buffers, which aren't exposed
\ No newline at end of file
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index f036e60..4f4af0f 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -2,6 +2,11 @@ from cuda.core.experimental._device import Device
 from cuda.core.experimental._utils import handle_return, ComputeCapability, CUDAError, \
     precondition
+import pytest
+
+@pytest.fixture(scope='module')
+def init_cuda():
+    Device().set_current()
 
 def test_device_initialization():
     device = Device()
@@ -11,6 +16,15 @@ def test_device_repr():
     device = Device()
     assert str(device).startswith('<Device 0')
     if driver_ver >= 11040:
         uuid = handle_return(cuda.cuDeviceGetUuid_v2(device.device_id))
@@ -38,16 +51,16 @@ def test_uuid():
     assert device.uuid == expected_uuid
 
 def test_name():
-    device = Device(0)
+    device = Device()
     name = handle_return(cuda.cuDeviceGetName(128, device.device_id))
     name = name.split(b'\0')[0]
     assert device.name == name.decode()
 
 def test_compute_capability():
-    device = Device(0)
+    device = Device()
     major = handle_return(cudart.cudaDeviceGetAttribute(
         cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device.device_id))
     minor = handle_return(cudart.cudaDeviceGetAttribute(
         cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device.device_id))
     expected_cc = ComputeCapability(major, minor)
-    assert device.compute_capability == expected_cc
+    assert device.compute_capability == expected_cc
\ No newline at end of file
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index f815576..3fddc3d 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -4,9 +4,15 @@ from cuda.core.experimental._device import Device
 from cuda import cuda
 from cuda.core.experimental._utils import handle_return
+import ctypes
 
-class DummyMemoryResource(MemoryResource):
-    def __init__(self):
+@pytest.fixture(scope='module')
+def init_cuda():
+    Device().set_current()
+
+class DummyDeviceMemoryResource(MemoryResource):
+    def __init__(self, device):
+        self.device = device
         pass
 
     def allocate(self, size, stream=None) -> Buffer:
@@ -14,7 +20,56 @@ def allocate(self, size, stream=None) -> Buffer:
         return Buffer(ptr=ptr, size=size, mr=self)
 
     def deallocate(self, ptr, size, stream=None):
-        handle_return(cuda.cuMemFree(ptr))
+        cuda.cuMemFree(ptr)
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return False
+
+    @property
+    def device_id(self) -> int:
+        return 0
+
+class DummyHostMemoryResource(MemoryResource):
+    def __init__(self):
+        pass
+
+    def allocate(self, size, stream=None) -> Buffer:
+        # Allocate a ctypes buffer of size `size`
+        ptr = (ctypes.c_byte * size)()
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        # the ctypes buffer is deallocated automatically at garbage collection time
+        pass
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return False
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return True
+
+    @property
+    def device_id(self) -> int:
+        raise RuntimeError("the host memory resource is not bound to any GPU")
+
+class DummyUnifiedMemoryResource(MemoryResource):
+    def __init__(self, device):
+        self.device = device
+        pass
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(cuda.cuMemAllocManaged(size, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value))
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        cuda.cuMemFree(ptr)
 
     @property
     def is_device_accessible(self) -> bool:
@@ -28,44 +83,117 @@ def is_host_accessible(self) -> bool:
     def device_id(self) -> int:
         return 0
 
-def test_buffer_initialization():
-    dummy_mr = DummyMemoryResource()
+class DummyPinnedMemoryResource(MemoryResource):
+    def __init__(self, device):
+        self.device = device
+        pass
+
+    def allocate(self, size, stream=None) -> Buffer:
+        ptr = handle_return(cuda.cuMemAllocHost(size))
+        return Buffer(ptr=ptr, size=size, mr=self)
+
+    def deallocate(self, ptr, size, stream=None):
+        cuda.cuMemFreeHost(ptr)
+
+    @property
+    def is_device_accessible(self) -> bool:
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        return True
+
+    @property
+    def device_id(self) -> int:
+        raise RuntimeError("the pinned memory resource is not bound to any GPU")
+
+def buffer_initialization(dummy_mr : MemoryResource):
     buffer = dummy_mr.allocate(size=1024)
     assert buffer.handle != 0
     assert buffer.size == 1024
     assert buffer.memory_resource == dummy_mr
-    assert buffer.is_device_accessible == True
-    assert buffer.is_host_accessible == True
-    assert buffer.device_id == 0
+    assert buffer.is_device_accessible == dummy_mr.is_device_accessible
+    assert buffer.is_host_accessible == dummy_mr.is_host_accessible
     dummy_mr.deallocate(buffer.handle, buffer.size)
 
-def test_buffer_copy_to():
-    dummy_mr = DummyMemoryResource()
-    src_buffer = dummy_mr.allocate(size=1024)
-    dst_buffer = dummy_mr.allocate(size=1024)
+def test_buffer_initialization():
     device = Device()
     device.set_current()
+    buffer_initialization(DummyDeviceMemoryResource(device))
+    buffer_initialization(DummyHostMemoryResource())
+    buffer_initialization(DummyUnifiedMemoryResource(device))
+    buffer_initialization(DummyPinnedMemoryResource(device))
+
+def buffer_copy_to(dummy_mr : MemoryResource, device : Device, check = False):
+    src_buffer = dummy_mr.allocate(size=1024)
+    dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
+
+    if check:
+        src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+        for i in range(1024):
+            src_ptr[i] = ctypes.c_byte(i)
+
     src_buffer.copy_to(dst_buffer, stream=stream)
-    # Assuming cuMemcpyAsync is correctly called, we can't directly check the result here
+    device.sync()
+
+    if check:
+        dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+
+        for i in range(10):
+            assert dst_ptr[i] == src_ptr[i]
+
     dummy_mr.deallocate(src_buffer.handle, src_buffer.size)
     dummy_mr.deallocate(dst_buffer.handle, dst_buffer.size)
 
-def test_buffer_copy_from():
-    dummy_mr = DummyMemoryResource()
-    src_buffer = dummy_mr.allocate(size=1024)
-    dst_buffer = dummy_mr.allocate(size=1024)
+def test_buffer_copy_to():
     device = Device()
     device.set_current()
+    buffer_copy_to(DummyDeviceMemoryResource(device), device)
+    buffer_copy_to(DummyUnifiedMemoryResource(device), device)
+    buffer_copy_to(DummyPinnedMemoryResource(device), device, check = True)
+
+def buffer_copy_from(dummy_mr : MemoryResource, device, check = False):
+    src_buffer = dummy_mr.allocate(size=1024)
+    dst_buffer = dummy_mr.allocate(size=1024)
     stream = device.create_stream()
+
+    if check:
+        src_ptr = ctypes.cast(src_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+        for i in range(1024):
+            src_ptr[i] = ctypes.c_byte(i)
+
     dst_buffer.copy_from(src_buffer, stream=stream)
-    # Assuming cuMemcpyAsync is correctly called, we can't directly check the result here
+    device.sync()
+
+    if check:
+        dst_ptr = ctypes.cast(dst_buffer.handle, ctypes.POINTER(ctypes.c_byte))
+
+        for i in range(10):
+            assert dst_ptr[i] == src_ptr[i]
+
     dummy_mr.deallocate(src_buffer.handle, src_buffer.size)
     dummy_mr.deallocate(dst_buffer.handle, dst_buffer.size)
 
-def test_buffer_close():
-    dummy_mr = DummyMemoryResource()
+def test_buffer_copy_from():
+    device = Device()
+    device.set_current()
+    buffer_copy_from(DummyDeviceMemoryResource(device), device)
+    buffer_copy_from(DummyUnifiedMemoryResource(device), device)
+    buffer_copy_from(DummyPinnedMemoryResource(device), device, check = True)
+
+def buffer_close(dummy_mr : MemoryResource):
     buffer = dummy_mr.allocate(size=1024)
     buffer.close()
     assert buffer.handle == 0
     assert buffer.memory_resource == None
+
+def test_buffer_close():
+    device = Device()
+    device.set_current()
+    buffer_close(DummyDeviceMemoryResource(device))
+    buffer_close(DummyHostMemoryResource())
+    buffer_close(DummyUnifiedMemoryResource(device))
+    buffer_close(DummyPinnedMemoryResource(device))
+
+test_buffer_copy_to()
\ No newline at end of file
diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py
index a7a124b..3c8e81d 100644
--- a/cuda_core/tests/test_program.py
+++ b/cuda_core/tests/test_program.py
@@ -2,6 +2,11 @@ from cuda import nvrtc
 from cuda.core.experimental._program import Program
 from cuda.core.experimental._module import ObjectCode, Kernel
+from cuda.core.experimental._device import Device
+
+@pytest.fixture(scope='module')
+def init_cuda():
+    Device().set_current()
 
 def test_program_init_valid_code_type():
     code = "extern \"C\" __global__ void my_kernel() {}"
@@ -48,3 +53,6 @@ def test_program_close():
     program = Program(code, "c++")
     program.close()
     assert program.handle is None
+
+Device().set_current()
+test_program_compile_valid_target_type()
\ No newline at end of file
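
The notes file added above asks two things: how to query the devices available on the system, and how to copy memory between host and device. For reference, the following is a minimal sketch of both, not part of the patch, built only from pieces these tests already exercise (Device, Buffer.copy_to, handle_return, device.create_stream/sync, and the DummyPinnedMemoryResource helper added to test_memory.py). The import of that helper and the use of cuda.cuDeviceGetCount from the cuda-python driver bindings are assumptions for illustration, not APIs introduced by this change.

    # sketch.py -- hypothetical companion to cuda_core/tests; not part of the patch
    import ctypes

    from cuda import cuda
    from cuda.core.experimental._device import Device
    from cuda.core.experimental._utils import handle_return

    # Assumes the tests directory is importable so the helper class can be reused.
    from test_memory import DummyPinnedMemoryResource


    def list_devices():
        # Initialize CUDA and bind a context, as the init_cuda fixtures do.
        Device().set_current()
        # Ask the driver how many devices are visible, then describe each one.
        count = handle_return(cuda.cuDeviceGetCount())
        for dev_id in range(count):
            dev = Device(dev_id)
            print(dev_id, dev.name, dev.compute_capability)
        return count


    def round_trip_copy(size=1024):
        # Pinned memory is both host- and device-accessible, so the result of a
        # copy issued on a stream can be inspected from the host afterwards --
        # the same trick buffer_copy_to(..., check=True) uses above.
        device = Device()
        device.set_current()
        mr = DummyPinnedMemoryResource(device)

        src = mr.allocate(size)
        dst = mr.allocate(size)

        # Fill the source buffer through a ctypes view of its pointer.
        src_ptr = ctypes.cast(src.handle, ctypes.POINTER(ctypes.c_byte))
        for i in range(size):
            src_ptr[i] = ctypes.c_byte(i)

        # Enqueue the copy on a stream, then synchronize before reading back.
        stream = device.create_stream()
        src.copy_to(dst, stream=stream)
        device.sync()

        dst_ptr = ctypes.cast(dst.handle, ctypes.POINTER(ctypes.c_byte))
        for i in range(size):
            assert dst_ptr[i] == src_ptr[i]

        mr.deallocate(src.handle, src.size)
        mr.deallocate(dst.handle, dst.size)

The pinned resource is used here for the same reason the tests pass check=True only for it: a purely device-side allocation could not be read back through ctypes without an extra copy to host memory.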