[llvm-dev] Redux sync intrinsics issue

Fri Aug 6 07:50:47 PDT 2021

 Hi all,

redux sync intrinsics are not working as expected.

clang/test/redux-builtins.cu has usage as

out = __nvvm_redux_sync_add(val, 0xFF);

out is the write location for the warp,
val is the thread's contributed value,
and 0xFF is the mask for a fully active warp.

So far all usage of this builtin has resulted in an Illegal instruction.
This is an nvcc application using the nvcc builtin to reduce across a warp:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda_profiler_api.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

__inline__ __device__
unsigned warpReduceRedux(unsigned val) {
    return __reduce_add_sync(0xFF, val);
}

__global__ void reduceKernel(unsigned *in, unsigned* out, int N) {
    unsigned sum = in[threadIdx.x];
    sum = warpReduceRedux(sum);
    if (threadIdx.x == 0)
        out[0] = sum;
}

int main()
{
    const int SIZE = 32;
    const int ARRAY_BYTES = SIZE * sizeof(unsigned);

    // generate the input array on the host
    unsigned h_in[SIZE];
    unsigned sum = 0.0f;
    for (int i = 0; i < SIZE; i++) {
        h_in[i] = i;
        sum += h_in[i];
    }

    // declare GPU memory pointers
    unsigned * d_in, *d_out;

    // allocate GPU memory
    cudaMalloc((void **)&d_in, ARRAY_BYTES);
    cudaMalloc((void **)&d_out, sizeof(unsigned));

    // transfer the input array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    // offload to device
    reduceKernel<<<1, SIZE>>>(d_in, d_out, SIZE);

    // copy back the sum from GPU
    unsigned h_out;
    cudaMemcpy(&h_out, d_out, sizeof(unsigned), cudaMemcpyDeviceToHost);
    printf("%u\n", h_out);
}

cuda-memcheck is clear and has verifiable output.
The same application, substituting the nvcc builtin for the clang one then
building with clang:

__inline__ __device__
unsigned warpReduceRedux(unsigned val) {
    return __nvvm_redux_sync_add(val, 0xFF);
}

compiles but does not pass cuda-memcheck and does not provide the correct
output:

========= CUDA-MEMCHECK
========= Illegal Instruction
========= at 0x00000cf0 in reduceKernel(unsigned int*, unsigned int*, int)
========= by thread (0,0,0) in block (0,0,0)

What is the usage for these? I've also attached the PTX emitted by these
apps in case there's a backend issue to be found.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20210806/bc8b118c/attachment.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: clang.ptx
Type: image/ptx
Size: 7038 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20210806/bc8b118c/attachment.bin>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: nvcc.ptx
Type: image/ptx
Size: 944 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20210806/bc8b118c/attachment-0001.bin>