<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/62477>62477</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            AMDGPU: weird miscompilation when calling function calling __ballot HIP function
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          Epliz
      </td>
    </tr>
</table>

<pre>
    Hi,

I am facing some weird mis-compilation for the following program:

```

#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>

#include <cstdint>

#if __AMDGCN_WAVEFRONT_SIZE == 32
typedef uint32_t hip_warp_ext_warpmask_t;
#elif __AMDGCN_WAVEFRONT_SIZE == 64
typedef uint64_t hip_warp_ext_warpmask_t;
#else
# error "Unsupported wavefront size"
#endif

// Active threads
static inline __device__ hip_warp_ext_warpmask_t __hip_warp_ext_activemask() {
  return __ballot(1);
}

// Warp vote functions
static inline __device__ int __hip_warp_ext_all(int predicate) {
  return __all(predicate);
}

static inline __device__ int __hip_warp_ext_any(int predicate) {
  return __any(predicate);
}

static inline __device__ hip_warp_ext_warpmask_t __hip_warp_ext_ballot(int predicate) {
  return __ballot(predicate);
}

static inline __device__ hip_warp_ext_warpmask_t __hip_warp_ext_match_any(int value) {
  bool active = true;
  hip_warp_ext_warpmask_t result = 0;

  while (active) {
    // determine what threads have the same value as the currently first active thread
    int first_active_value = __builtin_amdgcn_readfirstlane(value);
    int predicate = (value == first_active_value);
    hip_warp_ext_warpmask_t m = __hip_warp_ext_ballot(predicate); // THIS LINE IS PROBLEMATIC

    // if the current thread has the same value, set its result mask to the current one
    if (predicate) {
      result |= m;
      active = false;
    }
  }

  return result;
}


#include <cstdlib>

static void __assert_true(bool cond, const char* message, const char* file, int line) {
  if (!cond) {
    printf("Assertion failed in %s:%d with message %s\n", file, line, message);
 exit(-1);
  }
}

#define ASSERT_TRUE(cond, message) __assert_true((cond), (message), __FILE__, __LINE__);
#define ASSERT_FALSE(cond, message) __assert_true(!(cond), (message), __FILE__, __LINE__);
#define ASSERT_EQUAL(a, b, message) __assert_true(((a) == (b)), (message), __FILE__, __LINE__);
#define ASSERT_NOT_EQUAL(a, b, message) __assert_true(((a) != (b)), (message), __FILE__, __LINE__);

__global__
void test_warp_vote_functions(hip_warp_ext_warpmask_t* results) {
  results[2] = __hip_warp_ext_ballot(1);
  // will always see one bit set, can be any bit due to race condition
  results[5] = __hip_warp_ext_match_any(threadIdx.x);

  if (threadIdx.x < 16) {
 results[8] = __hip_warp_ext_activemask();
    results[9] = __hip_warp_ext_ballot(1);
  }
}

int main(int argc, char** argv) {
  
  hipSetDevice(0);

  hipDeviceProp_t props;
 hipGetDeviceProperties(&props, 0);

  bool hasWarpVote = props.arch.hasWarpVote != 0;
  printf("Has warpVote: %d\n", props.arch.hasWarpVote);

  size_t num_results = 16;
  size_t results_size = sizeof(hip_warp_ext_warpmask_t) * num_results;

 hip_warp_ext_warpmask_t* results = new hip_warp_ext_warpmask_t[num_results];
  hip_warp_ext_warpmask_t* results_device;

  if (hipMalloc((void**) &results_device, results_size) != hipSuccess) {
    ASSERT_TRUE(false, "HIP allocation failed");
  }

  unsigned num_blocks = 1;
  unsigned threads_per_block = 64;

  dim3 block_dim(threads_per_block);
  dim3 grid_dim(num_blocks);

  test_warp_vote_functions<<<grid_dim, block_dim, 0, 0>>>(results_device);

  if (hipMemcpyDtoH(results, results_device, results_size) != hipSuccess) {
    ASSERT_TRUE(false, "HIP copy failed");
  }

  size_t expected_warp_size = 64;
 size_t full_warp_mask = expected_warp_size == 64 ? 0xFFFFFFFFFFFFFFFFull : 0xFFFFFFFFull;
  size_t half_warp_mask_a = expected_warp_size == 64 ? 0xAAAAAAAAAAAAAAAAull : 0xAAAAAAAAull;
  size_t half_warp_mask_b = expected_warp_size == 64 ? 0x5555555555555555ull : 0x55555555ull;

  if (hasWarpVote) {
    ASSERT_EQUAL(results[2], full_warp_mask, "ballot");

    ASSERT_TRUE(results[5] != 0 && (results[5] & results[5] - 1) == 0, "match_any_none"); // should have one bit set

 // partially active
    printf("mask: %lu\n", results[8]);
 ASSERT_EQUAL(results[8], 0xFFFFull, "partial activemask");
 printf("ballot: %lu\n", results[9]);
    ASSERT_EQUAL(results[9], 0xFFFFull, "partial ballot");
  }

  printf("All tests passed\n");
  return 0;
}
```
The assertion `ASSERT_EQUAL(results[9], 0xFFFFull, "partial ballot");` gets triggered.
I should get the same value in `results[8]` and `results[9]` as it is basically the same expression being computed if we inline the different calls.

What is really weird is that if I change `__hip_warp_ext_match_any` to
```

static inline __device__ hip_warp_ext_warpmask_t __hip_warp_ext_match_any(int value) {
  bool active = true;
  hip_warp_ext_warpmask_t result = 0;

  while (active) {
    // determine what threads have the same value as the currently first active thread
    int first_active_value = __builtin_amdgcn_readfirstlane(value);
    int predicate = (value == first_active_value);
    hip_warp_ext_warpmask_t m = __ballot(predicate); // CHANGED THIS LINE TO DIRECTLY call __ballot

    // if the current thread has the same value, set its result mask to the current one
    if (predicate) {
      result |= m;
      active = false;
    }
 }

  return result;
}
```

then it works. However it doesn't inspire confidence to say the least, as it should give the exact same result.

Maybe the "ockl ballot hoisting hack" that I see in the generated assembly of the non-reduced case is at fault?

Best regards,
Epliz
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWVtz4joS_jXOS1dSIA8OPPBAuEyoylx2kjmndl9cwm5j7ciyS5JDOL9-S5KNZQcmOXVm92F3UxQBq9X9qS-fWoIqxfYCcR5M7oLJ6orWOi_lfF1x9sfVrkyP83sWkGUwWgWjhXvfAi0gowkTe1BlgXBAJlMomLpOyqJinGpWCshKCTpHyErOy4MRrmS5l7QIwoWvLohGzct_SEImEl6nCEG4zFkVkE3OqljWQrMCb_IgXL9LMqYV86XPzEmUTpnQZ2QyiOPFp9XH5ef498Vv6823L5-f4sftP9YQhKsgXEFInLA-VphiBjUTOiSxBoPgQGUV44u2HwqqfsQ6CO9O2pG_rT_68Fp_9OG9-hWevgBKWUoICPkuVF1VpdSYwoE-YyZLoUGxPzAgpJssUpb13bEJyAYWiWbPCDqXSFPlhpSmmiXABGcCIY5TfGYJxvElkBDHvRFqdZqxgEwDMoPgtlkFgERdSwFxvKOclzog03FAZt0yb1dnQP5OZQXPpUbIapGYZHwLKROvUXEekKkZqCSmLKEaL2Fzor7YJYB_CoA4vhuAFf1LAN4Zq1MY3oXrJP2fgFZQneSe154pr4fIdmXJweWbKTHQssYTHrhoSaKqubYzRh38ZtIhZxwhIFOnd2ARoEnKFDXKwiztkFPdVhDk1JYTgqIFOsxAlX2S1FKi0PwIGZNKt7jdzE6_WasVaAopdkoM2Dje1YxrJmJapPtExGamleVUYECmrY88HziFp4BZPa1kS0uvzQ11XPJk0eA6n1SDNGld93S_fYSH7ec1bB_h67cvdw_rT4un7bIfh5OnWea7r_EX5I1XOz8HZAkKNTCt2ggblKDL3vxSoOebDAY4-8GGU67cLs1Ki55XwM-9jBqC9odPBeF9HNSU036R_y7sbpztBrtbU2zPJUsNfyiFUse2GsjUFklSitQ4KCmF0pDkVAZkAQUqRff4eiBj3D41uWPqd-AY57aAjJ3agdMqyYTOrABZWCi2f6CMYwpMQEAmyjQNZJLCgem8heEGJkthdi6yPIFwAJYdWi838YWZTLse9zO2c-RwRwlTzEzVLh4f19-e4qdv39cBmbbu6Uy8cuNJamYEAzL14JAlxPFm-7COY_fZZLf5PPN38L7lzeLh8d2mx7_a-vpv3xcPhuTMjN3bS3eis5YxTFpZ078IzucvfwESGf8KSPY9jve83FEex-67LSiNypFebJqQuGtCyPRS30YWTWmrV3upezq5I8Fk9VP2HKa0Y8MD4xwoP9CjAoVo6Ax2TBvms2VMBewQqDjap2mNhv8kTdBSADPAX2OZXMDi78GOd7fpy83La7-1lOAJGbKCcdRff2dyesHksHvsMWo3ffZnvXeBEAzBFZSJpsmgcp9YNzoeNHGkcv88CKLXXzyiXtnGJiDT0Tm_5KxyAl9lWcVmJy4r1QHLWfWxVWEkDFmiskuPnChZwlnFltZzqkx3_FvZ7Ox2yg2VSX7TG3IlMvIc4pP0PVVwaGSDcGF4OPV4-LzSc5jMsSPWIOoibkJlUY0jz3Aj04zH5qsVMh_K7Gc1ZUp94Ssf2n-7Gq0lgYeLopM7X_9k9XY36WlvetwLtZGz6pPJzsRxl6EWl2JuZdFACVn2fOQxnUm7OklQDckFBruaa0ksG5L77Vew1qm3G9sAny-T5kEt7H1Cat2-42XyowmpN-kk0zTBcYXSyUJz6B06JGVFCFYiTllx4g1vZh-Wld9LljbiHZhzWXiRr8Ole3Walj4KW2jmLVw3LzIdBuUS8ZngYpFUx5Uu77t5fhT_fXFNyur47og29YcvFSYaU-eoUxV6wWols5pzJ2XbaSN1frKbD0G4gdHLZvBXcw6GW7qRmvPXtJBTnnXGYvpec4vBX2fOe_Kmud17zU0Gf50578mlVOlx6Nk4t61Qv1uwPXEvGE0OtJseOZefw-QZ7vrN1mAYKCARnJOIhq3CNYy9ZnDUwDh1C7Ewh6wWTtu5qLyseepOyH7X4qNtRCsqNaOcH5sD1vmzhfWA26547e1X_SajVw4X_Ttt_OvS0wTPralBAn5b0q8wH1ETiJ9img0x_SzsszdgnY38maLvHck4txSpoDI9dbfT-xqa8-nozNG0f7_7lCPQ0yEviEa_bC3RCPaoFWjJ9nuUmN44i9s2k_aoh1ctzEIYhDUaARVpf2DWDihgGpiCHVUssSl3UokvlUSlzLp2yMQekrKoam0OsRkcsL3aMvIpyzK01wtGh7rxvf97Tq0FiVa9u11nCrR9nsHWdJvCHH-j0cX-OxqBLs8G4P-3bf9zt21vXbAt7xefP65X3kXb0xdYbb-tl08Pf7cZ2in5r71z-9NXbufKSucoDEEcSvlD3cB9ecBnlOZJWqISAbnVwISqmLTn64ylKBJ75FbUMQlHquzJ3DFNS12sSVx8oYl2DnXAetTxiR53Ti4gpEx-tCQJecmUNpSUU9MrE0cmW3sjwISdsUeBkhq2MvRc7PgRShdVUYpriWmdYAoJVWjIiGrIqPXLxgdwh8oU8J7KVJ1-NnQ_JqbzMJ2FM3qF83E0JbMwjMLxVT5Px-FkgrskQ0rDXUjJeDIaT2fTkCZRgtH4is3JiISjyWg8moXhZHxzi9PxLAojJNEkmo4nwYcRFpTxG86fi5tS7q-YUjXOI_Lh9vaK0x1yZX_fJMQc5eyg2TYmqys5N3Oud_VeBR9GnCmtOi2aaY7zxafVx6_fzR59-p3T_5nzYCJuSsR4tz06nB60ZQOm7W5Hr2rJ57nWlbvVNAW0ZzqvdzdJWQRkYwA0_64rWf4TEx2QjYWtArKxy_pXAAAA__8ZBxdM">