Hi there!
Let’s say that I want to increment a property var incremental:Int32
every time a kernel thread is executed:
//SWIFT
var incremental:Int32 = 0
var incrementalBuffer:MTLBuffer!
var incrementalPointer: UnsafeMutablePointer<Int32>!
init(metalView: MTKView) {
...
incrementalBuffer = Renderer.device.makeBuffer(bytes: &incremental, length: MemoryLayout<Int32>.stride)
incrementalPointer = incrementalBuffer.contents().bindMemory(to: Int32.self, capacity: 1)
}
func draw(in view: MTKView) {
...
computeCommandEncoder.setComputePipelineState(computePipelineState)
let width = computePipelineState.threadExecutionWidth
let threadsPerGroup = MTLSizeMake(width, 1, 1)
let threadsPerGrid = MTLSizeMake(10, 1, 1)
computeCommandEncoder.setBuffer(incrementalBuffer, offset: 0, index: 0)
computeCommandEncoder.dispatchThreads(threadsPerGrid, threadsPerThreadgroup: threadsPerGroup)
computeCommandEncoder.endEncoding()
commandBufferCompute.commit()
commandBufferCompute.waitUntilCompleted()
print(incrementalPointer.pointee)
}
//METAL
kernel void compute_shader (device int& incremental [[buffer(0)]]){
incremental++;
}
So I expect outputs:
10
20
30
...
but I get:
1
2
3
...
As you suggest @caroline
kernel void compute_shader (device atomic_int& incremental [[buffer(0)]]){
atomic_fetch_add_explicit(&incremental, 1, memory_order_relaxed);
}
Solves the issue!! But if I try to optimise it:
[[kernel]] void compute_shader (device atomic_int& incremental [[buffer(0)]],
ushort lid [[thread_position_in_threadgroup]] ){
threadgroup atomic_int local_atomic {0};
atomic_fetch_add_explicit(&local_atomic, 1, memory_order_relaxed);
threadgroup_barrier(mem_flags::mem_threadgroup);
if(lid == 0) {
int local_non_atomic = atomic_load_explicit(&local_atomic, memory_order_relaxed);
atomic_fetch_add_explicit(&incremental, local_non_atomic, memory_order_relaxed);
}
}
Expect:
10
20
30
...
Get:
1125974026
1125974036
-2000908258
-832823256 ...
I fear this is above my pay grade, and I don’t see how you can be expecting 10, 20, 30
from the result.
However if you replace
threadgroup atomic_int local_atomic {0};
with:
threadgroup atomic_int local_atomic;
atomic_store_explicit(&local_atomic, 0, metal::memory_order_relaxed);
then you get 1, 2, 3...
Thank you very much @caroline,
The issue was initialise local threadgroup var when declared as you pointed.
In this way works fine:
threadgroup atomic_int local_atomic;
if (lid == 0) atomic_store_explicit(&local_atomic, 0, memory_order_relaxed);
I’m adding 1 for each thread in the threadgroup to the local variable and when all threads of the threadgroup are done (barrier) add the local to the increment.
1 Like