<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href="https://github.com/llvm/llvm-project/issues/128408">128408</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            AMDGPU backend doesn't lower __builtin_nontemporal_load correctly
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          Epliz
      </td>
    </tr>
</table>

<pre>
    Hi,

It looks like the builtin (mentioned at https://gpuopen.com/learn/amd-lab-notes/amd-lab-notes-finite-difference-docs-laplacian_part3/) is not lowered correctly for float types, as well as vectors of floats.

Example HIP kernel:
```

#define FULL_MASK32 0xffffffff
#define FULL_MASK64 0xffffffffffffffff

#ifdef __CUDA_ARCH__
#define __xx_shfl_down(mask, val, offset) __shfl_down_sync(mask, val, offset)
#elif defined(__HIP_PLATFORM_AMD__) // AMD
#define __xx_shfl_down(mask, val, offset) __shfl_down(val, offset)
#else
#error "Unsupported compiler"
#endif

__device__ float warpReduce(float val) {
  if (warpSize == 32) {
    for (int offset = 16; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK32, val, offset);
  }
  if (warpSize == 64) {
    for (int offset = 32; offset > 0; offset /= 2)
      val += __xx_shfl_down(FULL_MASK64, val, offset);

  }
 return val;
}

template <typename T>
static inline const T* __device__ addr(const T* p, unsigned index) {
  // helps the AMDGPU compiler understand it can use the sgrp pair + single vgpr addressing mode
  unsigned byte_offset = sizeof(T) * index;
  const uint8_t* p8 = (const uint8_t*)p;
 return (const T*) (p8 + byte_offset);
}

#define DIV_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
#define ALIGN_UP(a, b) (DIV_ROUND_UP((a), (b)) * (b))

#define THREADS_PER_BLOCK 256
#define WARPS_PER_BLOCK 4

#define FLOAT8_ELEMENTS_PER_THREAD 8
#define FLOAT8_ELEMENTS_PER_BLOCK ((THREADS_PER_BLOCK) * (FLOAT8_ELEMENTS_PER_THREAD))

float4 load_nontemporal_float4(const float* p) {
  float x = __builtin_nontemporal_load(p);
  float y = __builtin_nontemporal_load(p + 1);
  float z = __builtin_nontemporal_load(p + 2);
  float w = __builtin_nontemporal_load(p + 3);

  float4 v = make_float4(x, y, z, w);
}

__global__ void float8_nt_bandwidth_kernel(
  const float* __restrict__ A,
  bool* __restrict__ out_flag,
  unsigned N
) {
  int warpCounts = THREADS_PER_BLOCK / warpSize;
  int warpId = threadIdx.x / warpSize;
  int laneId = threadIdx.x % warpSize;
  int tid = blockIdx.x * FLOAT8_ELEMENTS_PER_BLOCK + (4 * threadIdx.x);

  __shared__ float shared_acc;
  if (threadIdx.x == 0) {
    shared_acc = 0.f;
  }
  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }


  float r = 0.f;
  if ((blockIdx.x + 1) * FLOAT8_ELEMENTS_PER_BLOCK <= N) {
 unsigned off = tid;
    float4 v0 = load_nontemporal_float4(addr(A, off));
 off += 4 * THREADS_PER_BLOCK;
    float4 v1 = load_nontemporal_float4(addr(A, off));

    float4 v01 = v0 + v1;

    r = (v01.x+ v01.y) + (v01.z + v01.w);
  }

  r = warpReduce(r);

  if (laneId == 0) {
    atomicAdd(&shared_acc, r);
  }

  if (THREADS_PER_BLOCK > warpSize) {
    __syncthreads();
  }

  if (threadIdx.x == 0) {
    *out_flag = (shared_acc > 0.f);
  }
}

void float8_nt_bandwidth(
  const void* __restrict__ A,
  bool* __restrict__ out_flag,
  unsigned N
) {
  const int threads_per_blocks = THREADS_PER_BLOCK;
  const int num_blocks = DIV_ROUND_UP(N, FLOAT8_ELEMENTS_PER_BLOCK);
  float8_nt_bandwidth_kernel<<<num_blocks, threads_per_blocks>>>((const float*)A, (bool*)out_flag, N);
}
```

HIP version 6.3.2
AMD clang version 18.0

Best regards,
Epliz
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJy0WEtv6joe_zRmY4ESB2hYsAiv2-q2PVVPO7O0TOyAp8aObAdoP_3IdgJJoZ1qrk4Vkcb-_d8vJ8QYvpGMTcFoBkaLHqnsVunpshT8o7dW9H16ywGagygDUXZnoVDqzUDB3xi0WwbXFReWSwhQumPSciUZhcTCrbWlAUkG0Aqg1aasVMnkIFc7gFaCES0BWpEd7Quy7ktlmfn83C-45Jb1KS8KppnMWZ-q3PQFKQXJOZG4JNomnv8EcgOlcsodmGYU5kprllvxDgulYSEUsdC-l07KHBIDD0wId9-z3CptYAMyg2Dn8kh2pWDw9u4JvjEtmXCmRBkYR_XlYQAllBVcMrh6vb_HD9nvvxMEo2NR_12HjIctSBvq0bygrIAYz18XGc6e57cYd9hgfDxisy0EpuognduJeXNm7YlwN1UUhlnnE3yGYfMu82-wQQITvIBBDAUoxfj27gk_3Wcvq1_PDzh7WGDs-IaQwuxh8c8VAyj9ShfD6v-0VhoChF6lqcpSaesDvCu5YBogVKMk5bUTMaZsz3OGcR35A9HlM6NVzgBKw5IXOoHgZgaiDEJeuAR2uN_8g0GQLECygAlqY6DPJYBSLm2trQPCeAyS2XlhCaP2M1o5DAp2Qf-3JwICNHPrFz5r5dHVOCVBF3Cz-Ebv8fAneifoT-jtZH-ld1t1zWylpQeGPb8MosyyXSmIdcbMXclKsmPwBSRLEGXGEstzyKVwCZcraSx8ASiDrZATSjVAaWuzdKpU0rc5Crmk7Nj2T53OWyZK41ta9rD46-n1lGKwkpRpY4mkkFuYEwkrE5qf2egSloQ7986g4XIjGNxvSu21YMatwJ2izAs6qbB-twy3ImH4B1MFQOlLKK-sVrIOdjCl4tKm2HqDUk92svK8BdCkrMlqB3dcEdinjh7N2mqcQ9SE4VzXi7t_4edfr48L_PoEUEqcN9c1p3CR8DRzK36nD-O6UTRLp7qumWb3d389XmH4SVjN3CHOjLyHunzbrF9un5fZ4jd-Wj7j2f2v-d8QjcYdxL-z56f2_vCim9__yl5SvLxfPiwfXwI0sIXp_wQGnkH5C11a6n8tpG2Xb1dDKBShWCrpqkNpInBYP0XXP9bJfk7t0OuOMJRsPao7bBxflxGt3hKI3n9A5GMeX5B-_JQUXZAefkqadJtK7aW9J9-RN3b2z9Elz7v7-XA_h8tUx3gj1JoIjOFecRqYpVhavCaSHji1W1wfAlDaqsiTzzHWzFjNc4sxzMJhCcK1UuJiV1UWF4JsGtCpJzz6vGpNJBnm1lxV0hpv12ViuwJrmn_jyIbwjnoiu9WM0Dt6HBy_hAsi2VX46Crc8oBdC5W_NdDs21rwrWHoYS0J3Ri6UwHRjJ7Gdv1I8vwk3Y-7jo5h4kXdgXem9IpGg-L64Lzi0WR5trnDE_szVJBtfHV_msadXIT6s-Qg0bWttttmTaf81n_J3DF7bCl0yhtVFCFwnDaSzuUQ-a2ve0c9K7N6Vtd9J7DxjMO0D4G77GUX8uL_X95nzQMrZwGawX3cAelm_O2j2GXRzOEH760p5J4_YLNzuBarhk3ncKi7-oSYncvjSqoRq3Y8zyj1wR23chbNob4u-M8k38_LA6Cs6UONJzsVs_R5eyGiEfRVk-x2R4f6U80xSPC9KLgEl0xjX1hftMruWcpRymrXpvh08Hh04fuyIj_PrevDwlWtu86SHNNLjd3ZNly-P3SHC0CTrDn-BJ8BNGk5yneFzkDrvqK6N9g904YrCceDZOBel7KHBcwFkZvTTpwOavyMGQs12xDtEs3FIXwJoNOETpIJ6bFpfDOMkigex3FvOyVRkuQ0T4okWsc0z_MbRmJK6TiaDKOomPT4FEVoFCGURJNoPIoH6CZNxuO0iEcxSdloBIYR2xEuBkLsdwOlNz1uTMWmMUqHUdoT
ZM2E8V8nEJLsAP2ue-8bLXp66oj662pjwDAS3FhzZmO5FWxan-bXJH9jkkKqmJEA3dTfCr45a5w_IvQqLaafPmhwu63WzfcMsW9u_VKr_7DcArTyehqAVrUh-yn6bwAAAP__36grlg">