<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/128408">128408</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
AMDGPU backend doesn't lower __builtin_nontemporal_load correctly
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
Epliz
</td>
</tr>
</table>
<pre>
Hi,
It looks like the builtin (mentioned at https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-finite-difference-docs-laplacian_part3/) is not lowered correctly for float types, nor for vectors of floats.
Example HIP kernel:
```
#include <hip/hip_runtime.h>
#include <stdint.h>

#define FULL_MASK32 0xffffffff
#define FULL_MASK64 0xffffffffffffffff
#ifdef __CUDA_ARCH__
#define __xx_shfl_down(mask, val, offset) __shfl_down_sync(mask, val, offset)
#elif defined(__HIP_PLATFORM_AMD__) // AMD
#define __xx_shfl_down(mask, val, offset) __shfl_down(val, offset)
#else
#error "Unsupported compiler"
#endif
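// Warp-wide sum reduction using shuffle-down; handles both 32-lane (NVIDIA)
// and 64-lane (AMD) warps.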
__device__ float warpReduce(float val) {
  if (warpSize == 32) {
    for (int offset = 16; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK32, val, offset);
  }
  if (warpSize == 64) {
    for (int offset = 32; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK64, val, offset);
  }
  return val;
}
template <typename T>
static inline __device__ const T* addr(const T* p, unsigned index) {
  // helps the AMDGPU compiler see that it can use the SGPR-pair base + single-VGPR offset addressing mode
  unsigned byte_offset = sizeof(T) * index;
  const uint8_t* p8 = (const uint8_t*)p;
  return (const T*) (p8 + byte_offset);
}
#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_UP(a, b) (DIV_ROUND_UP((a), (b)) * (b))
#define THREADS_PER_BLOCK 256
#define WARPS_PER_BLOCK 4
#define FLOAT8_ELEMENTS_PER_THREAD 8
#define FLOAT8_ELEMENTS_PER_BLOCK ((THREADS_PER_BLOCK) * (FLOAT8_ELEMENTS_PER_THREAD))
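// Loads 4 consecutive floats with __builtin_nontemporal_load; on AMDGPU these
// should become global loads with the nontemporal cache-policy bits set.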
__device__ float4 load_nontemporal_float4(const float* p) {
  float x = __builtin_nontemporal_load(p);
  float y = __builtin_nontemporal_load(p + 1);
  float z = __builtin_nontemporal_load(p + 2);
  float w = __builtin_nontemporal_load(p + 3);
  float4 v = make_float4(x, y, z, w);
  return v;
}
__global__ void float8_nt_bandwidth_kernel(
    const float* __restrict__ A,
    bool* __restrict__ out_flag,
    unsigned N
) {
  int warpCounts = THREADS_PER_BLOCK / warpSize;
  int warpId = threadIdx.x / warpSize;
  int laneId = threadIdx.x % warpSize;
  int tid = blockIdx.x * FLOAT8_ELEMENTS_PER_BLOCK + (4 * threadIdx.x);
  __shared__ float shared_acc;
  if (threadIdx.x == 0) {
    shared_acc = 0.f;
  }
  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }
  float r = 0.f;
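  // Full blocks only: each thread accumulates 8 floats via two nontemporal float4 loads.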
  if ((blockIdx.x + 1) * FLOAT8_ELEMENTS_PER_BLOCK <= N) {
    unsigned off = tid;
    float4 v0 = load_nontemporal_float4(addr(A, off));
    off += 4 * THREADS_PER_BLOCK;
    float4 v1 = load_nontemporal_float4(addr(A, off));
    float4 v01 = v0 + v1;
    r = (v01.x + v01.y) + (v01.z + v01.w);
  }
  r = warpReduce(r);
  if (laneId == 0) {
    atomicAdd(&shared_acc, r);
  }
  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }
  if (threadIdx.x == 0) {
    *out_flag = (shared_acc > 0.f);
  }
}
void float8_nt_bandwidth(
    const void* __restrict__ A,
    bool* __restrict__ out_flag,
    unsigned N
) {
  const int threads_per_blocks = THREADS_PER_BLOCK;
  const int num_blocks = DIV_ROUND_UP(N, FLOAT8_ELEMENTS_PER_BLOCK);
  float8_nt_bandwidth_kernel<<<num_blocks, threads_per_blocks>>>((const float*)A, (bool*)out_flag, N);
}
```
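For easier inspection, here is a reduced kernel (a minimal sketch, not taken from my real code) that should show the same problem. At the IR level I would expect each such load to carry !nontemporal metadata, which the backend should then turn into the nontemporal cache-policy bits of the generated global load instructions:
```
#include <hip/hip_runtime.h>

// Minimal reproducer: a single nontemporal float load per thread.
// Compile with e.g. `hipcc --offload-arch=gfx90a -S reduced.hip` and check
// the cache-policy bits on the generated global_load instructions.
__global__ void nt_load(const float* __restrict__ in, float* __restrict__ out) {
  out[threadIdx.x] = __builtin_nontemporal_load(in + threadIdx.x);
}
```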
HIP version 6.3.2
AMD clang version 18.0
Best regards,
Epliz
</pre>