
Threadgroup atomics require all-atomic operation #217

Open

maleadt opened this issue Jun 26, 2023 · 3 comments
Labels
kernels Things about kernels and how they are compiled.

Comments

@maleadt
Member

maleadt commented Jun 26, 2023

MWE:

using Metal

function local_kernel(a, expected::AbstractArray{T}, desired::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 16)

    #b[i] = a[i]
    Metal.atomic_store_explicit(pointer(b, i), Metal.atomic_load_explicit(pointer(a, i)))

    while Metal.atomic_compare_exchange_weak_explicit(pointer(b, i), expected[i], desired) != expected[i]
        # keep on trying
    end

    #a[i] = b[i]
    Metal.atomic_store_explicit(pointer(a, i), Metal.atomic_load_explicit(pointer(b, i)))

    return
end

function main(; T=Int32, n=16)
    a = Metal.zeros(T, n)
    expected = copy(a)
    desired = T(42)
    @metal threads=n local_kernel(a, expected, desired)
    Array(a)
end

Note how the loads and stores that initialize the threadgroup memory and copy it back to global memory need to be atomic for this example to work, even though every thread has its own dedicated memory address to act upon. Demoting those operations to regular array operations results in the final array containing all zeros.
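
For concreteness, here is the demoted variant just described (a sketch; it simply enables the commented-out plain accesses from the MWE, and the kernel name is made up):

# Demoted sketch: plain loads/stores around the atomic CAS.
# Launched the same way as above, this leaves the result array all zeros.
function local_kernel_demoted(a, expected::AbstractArray{T}, desired::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 16)

    b[i] = a[i]   # plain store into threadgroup memory

    while Metal.atomic_compare_exchange_weak_explicit(pointer(b, i), expected[i], desired) != expected[i]
        # keep on trying
    end

    a[i] = b[i]   # plain copy back to global memory

    return
end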

This smells like an upstream bug, especially because the above pattern is impossible to replicate in Metal C++ (where atomic_int is used as the element type, promoting all operations to atomic; the Metal Shading Language only allows the atomic_* functions on atomic types, so the demoted accesses would not even compile):

#include <metal_stdlib>
using namespace metal;

kernel void local_kernel(device atomic_int* a [[ buffer(0) ]],
                         device int* expected [[ buffer(1) ]],
                         device int* desired [[ buffer(2) ]],
                         uint i [[ thread_position_in_grid ]]) {
    threadgroup atomic_int b[16];
    atomic_store_explicit(&b[i], atomic_load_explicit(&a[i], memory_order_relaxed), memory_order_relaxed);

    int expectedValue = expected[i];
    while (!atomic_compare_exchange_weak_explicit(&b[i], &expectedValue, desired[i], memory_order_relaxed, memory_order_relaxed)) {
        // keep on trying
    }
    atomic_store_explicit(&a[i], atomic_load_explicit(&b[i], memory_order_relaxed), memory_order_relaxed);
}

The host code driving this kernel:

#import <Metal/Metal.h>

int main() {
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();

    NSError *error = nil;
    id<MTLLibrary> library = [device newLibraryWithFile:@"atomic_xchg.metallib" error:&error];
    id<MTLFunction> function = [library newFunctionWithName:@"local_kernel"];

    id<MTLCommandQueue> commandQueue = [device newCommandQueue];
    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];

    id<MTLComputePipelineState> pipelineState = [device newComputePipelineStateWithFunction:function error:&error];

    int n = 16;
    id<MTLBuffer> a = [device newBufferWithLength:n*sizeof(int) options:MTLResourceOptionCPUCacheModeDefault];
    int expectedValues[n];
    memset(expectedValues, 0, n*sizeof(int));   // must match a's initial contents (newBufferWithLength: zero-fills)
    id<MTLBuffer> expected = [device newBufferWithBytes:expectedValues length:n*sizeof(int) options:MTLResourceOptionCPUCacheModeDefault];
    int desiredValues[n];
    for (int i = 0; i < n; i++) {
        desiredValues[i] = 42;
    }
    id<MTLBuffer> desired = [device newBufferWithBytes:&desiredValues length:n*sizeof(int) options:MTLResourceOptionCPUCacheModeDefault];

    id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
    [encoder setComputePipelineState:pipelineState];
    [encoder setBuffer:a offset:0 atIndex:0];
    [encoder setBuffer:expected offset:0 atIndex:1];
    [encoder setBuffer:desired offset:0 atIndex:2];

    MTLSize threadsPerGroup = MTLSizeMake(n, 1, 1);
    MTLSize numThreadgroups = MTLSizeMake(1, 1, 1);
    [encoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:threadsPerGroup];
    [encoder endEncoding];

    [commandBuffer commit];
    [commandBuffer waitUntilCompleted];

    int *result = a.contents;
    for (int i = 0; i < n; i++) {
        NSLog(@"%d\n", result[i]);
    }

    return 0;
}
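
For completeness, a plausible way to build and run this reproducer (file names assumed; the metallib name has to match the newLibraryWithFile: path above):

xcrun -sdk macosx metal -c local_kernel.metal -o local_kernel.air
xcrun -sdk macosx metallib local_kernel.air -o atomic_xchg.metallib
clang -framework Metal -framework Foundation main.m -o repro
./repro
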
@maleadt maleadt added the bug label Jun 26, 2023
@maleadt
Member Author

maleadt commented Jun 26, 2023

Hmm, this seems to apply to other atomics as well:

using Metal

function local_kernel(f, a, val::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 128)
    #b[i] = a[i]
    val = Metal.atomic_load_explicit(pointer(a, i))
    Metal.atomic_store_explicit(pointer(b, i), val)
    f(pointer(b, i), val)
    #a[i] = b[i]
    val = Metal.atomic_load_explicit(pointer(b, i))
    Metal.atomic_store_explicit(pointer(a, i), val)
    return
end

function main(; T=Int32, n=16)
    a = ones(T, n)
    b = MtlArray(a)
    val = one(T)
    @metal threads=n local_kernel(Metal.atomic_fetch_add_explicit, b, val)
    @show a .+ val
    @show Array(b)
    return
end
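
And the demoted counterpart, for reference (a sketch mirroring the first MWE, with the atomic copies swapped for the commented-out plain accesses):

# Demoted sketch: plain loads/stores around the atomic operation under test.
function local_kernel_demoted(f, a, val::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 128)
    b[i] = a[i]             # plain store into threadgroup memory
    f(pointer(b, i), val)   # the atomic operation, e.g. atomic_fetch_add_explicit
    a[i] = b[i]             # plain copy back to global memory
    return
end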

@maleadt maleadt changed the title Atomic compare-exchange with threadgroup memory requires additional atomics Threadgroup atomics require all-atomic operation Jun 26, 2023
@vchuravy
Member

> Note how the loads and stores that initialize the threadgroup memory and copy it back to global memory need to be atomic for this example to work, even though every thread has its own dedicated memory address to act upon. Demoting those operations to regular array operations results in the final array containing all zeros.

I think this is a general truth (and why we don't have atomics for arrays yet).

If you mix atomic operations with non-atomic operations, you will get issues.

But I would have expected that the loads and stores to b[i] are thread-local, so those could have been non-atomic; just the loads and stores to global memory would need to be atomic.

@maleadt
Member Author

maleadt commented Jun 26, 2023

> If you mix atomic operations with non-atomic operations, you will get issues.

Why is that? Every thread is accessing its own memory locations, so why would mixing atomics with regular loads and stores not work? Note that removing atomics altogether works fine here.
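
For reference, the fully non-atomic version alluded to here (a sketch of the second MWE's kernel with every atomic removed):

# Entirely non-atomic sketch: every thread only touches its own slot.
function local_kernel_plain(a, val::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 128)
    b[i] = a[i]    # plain copy into threadgroup memory
    b[i] += val    # plain read-modify-write instead of an atomic fetch-add
    a[i] = b[i]    # plain copy back to global memory
    return
end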

@maleadt maleadt added the kernels Things about kernels and how they are compiled. label Feb 28, 2024
@tgymnich tgymnich removed the bug label Oct 18, 2024