<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/55314>55314</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            wasted opportunities with array indexing patterns in AMDGPU backend
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          Epliz
      </td>
    </tr>
</table>

<pre>
    Hi,

I have observed some suboptimal generated assembly for what seems like pretty common code patterns with Clang 14.0.0 coming from ROCM 5.0.2.

For this example kernel code:
```
__global__
void suboptimal_indexing(int const* a, int const* b, int* c, size_t N, size_t C, size_t offset) {
  size_t globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
  if (globalThreadId < N) {
    for (size_t i = 0; i < C; i++) {
      int r = 0;
      // access 0
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 1
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 2
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 3
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 4
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 5
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 6
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      // access 7
      r += a[globalThreadId] + b[globalThreadId];
      globalThreadId += offset;
      
      c[globalThreadId] = r;
    }
  }
}
```
The generated assembly is:
```

# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1031
        .text
        .amdgcn_target "amdgcn-amd-amdhsa--gfx1031"
        .protected      _Z19suboptimal_indexingPKiS0_Pimmm ; -- Begin function _Z19suboptimal_indexingPKiS0_Pimmm
        .globl  _Z19suboptimal_indexingPKiS0_Pimmm
        .p2align        8
        .type   _Z19suboptimal_indexingPKiS0_Pimmm,@function
_Z19suboptimal_indexingPKiS0_Pimmm:     ; @_Z19suboptimal_indexingPKiS0_Pimmm
; %bb.0:
        s_load_dword s4, s[4:5], 0x4
        s_clause 0x1
        s_load_dwordx8 s[12:19], s[6:7], 0x0
        s_load_dwordx4 s[0:3], s[6:7], 0x20
        v_mov_b32_e32 v1, 0
        s_waitcnt lgkmcnt(0)
        s_and_b32 s4, s4, 0xffff
        s_mul_i32 s8, s8, s4
        s_cmp_lg_u64 s[0:1], 0
        v_add_nc_u32_e32 v0, s8, v0
        s_cselect_b32 s4, -1, 0
        v_cmp_gt_u64_e32 vcc_lo, s[18:19], v[0:1]
        s_and_b32 s4, vcc_lo, s4
        s_and_saveexec_b32 s5, s4
        s_cbranch_execz BB0_3
; %bb.1:
        v_lshlrev_b64 v[0:1], 2, v[0:1]
        s_lshl_b64 s[4:5], s[2:3], 5
        s_add_u32 s6, s16, s4
        s_addc_u32 s7, s17, s5
        s_lshl_b64 s[2:3], s[2:3], 2
BB0_2:                                  ; =>This Inner Loop Header: Depth=1
        v_add_co_u32 v2, vcc_lo, s12, v0
        v_add_co_ci_u32_e32 v3, vcc_lo, s13, v1, vcc_lo
        v_add_co_u32 v4, vcc_lo, s14, v0
        v_add_co_ci_u32_e32 v5, vcc_lo, s15, v1, vcc_lo
        s_add_u32 s0, s0, -1
        global_load_dword v10, v[2:3], off
        global_load_dword v11, v[4:5], off
        v_add_co_u32 v2, vcc_lo, v2, s2
        v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
        v_add_co_u32 v4, vcc_lo, v4, s2
        v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
        v_add_co_u32 v6, vcc_lo, v2, s2
        v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
        v_add_co_u32 v8, vcc_lo, v4, s2
        v_add_co_ci_u32_e32 v9, vcc_lo, s3, v5, vcc_lo
        global_load_dword v12, v[2:3], off
        global_load_dword v13, v[4:5], off
        global_load_dword v14, v[6:7], off
        global_load_dword v15, v[8:9], off
        v_add_co_u32 v2, vcc_lo, v6, s2
        v_add_co_ci_u32_e32 v3, vcc_lo, s3, v7, vcc_lo
        v_add_co_u32 v4, vcc_lo, v8, s2
        v_add_co_ci_u32_e32 v5, vcc_lo, s3, v9, vcc_lo
        v_add_co_u32 v6, vcc_lo, v2, s2
        v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
        v_add_co_u32 v8, vcc_lo, v4, s2
        v_add_co_ci_u32_e32 v9, vcc_lo, s3, v5, vcc_lo
        global_load_dword v16, v[2:3], off
        global_load_dword v17, v[4:5], off
        global_load_dword v18, v[6:7], off
        global_load_dword v19, v[8:9], off
        v_add_co_u32 v2, vcc_lo, v6, s2
        v_add_co_ci_u32_e32 v3, vcc_lo, s3, v7, vcc_lo
        v_add_co_u32 v4, vcc_lo, v8, s2
        v_add_co_ci_u32_e32 v5, vcc_lo, s3, v9, vcc_lo
        v_add_co_u32 v6, vcc_lo, v2, s2
        v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
        v_add_co_u32 v8, vcc_lo, v4, s2
        v_add_co_ci_u32_e32 v9, vcc_lo, s3, v5, vcc_lo
        global_load_dword v20, v[2:3], off
        global_load_dword v21, v[4:5], off
        global_load_dword v22, v[6:7], off
        global_load_dword v23, v[8:9], off
        v_add_co_u32 v2, vcc_lo, v6, s2
        v_add_co_ci_u32_e32 v3, vcc_lo, s3, v7, vcc_lo
        v_add_co_u32 v4, vcc_lo, v8, s2
        v_add_co_ci_u32_e32 v5, vcc_lo, s3, v9, vcc_lo
        global_load_dword v6, v[2:3], off
        global_load_dword v4, v[4:5], off
        s_addc_u32 s1, s1, -1
        s_cmp_lg_u64 s[0:1], 0
        s_waitcnt vmcnt(14)
        v_add_nc_u32_e32 v2, v11, v10
        s_waitcnt vmcnt(12)
        v_add3_u32 v2, v2, v12, v13
        s_waitcnt vmcnt(10)
        v_add3_u32 v2, v2, v14, v15
        s_waitcnt vmcnt(8)
        v_add3_u32 v2, v2, v16, v17
        s_waitcnt vmcnt(6)
        v_add3_u32 v2, v2, v18, v19
        s_waitcnt vmcnt(4)
        v_add3_u32 v5, v2, v20, v21
        v_add_co_u32 v2, vcc_lo, s6, v0
        v_add_co_ci_u32_e32 v3, vcc_lo, s7, v1, vcc_lo
        s_waitcnt vmcnt(2)
        v_add3_u32 v5, v5, v22, v23
        v_add_co_u32 v0, vcc_lo, v0, s4
        v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
        s_waitcnt vmcnt(0)
        v_add3_u32 v4, v5, v6, v4
        global_store_dword v[2:3], v4, off
        s_cbranch_scc1 BB0_2
BB0_3:
        s_endpgm
        .section        .rodata,#alloc
        .p2align        6
        .amdhsa_kernel _Z19suboptimal_indexingPKiS0_Pimmm
                .amdhsa_group_segment_fixed_size 0
                .amdhsa_private_segment_fixed_size 0
                .amdhsa_kernarg_size 104
                .amdhsa_user_sgpr_private_segment_buffer 1
                .amdhsa_user_sgpr_dispatch_ptr 1
                .amdhsa_user_sgpr_queue_ptr 0
                .amdhsa_user_sgpr_kernarg_segment_ptr 1
                .amdhsa_user_sgpr_dispatch_id 0
                .amdhsa_user_sgpr_flat_scratch_init 0
                .amdhsa_user_sgpr_private_segment_size 0
                .amdhsa_wavefront_size32 1
                .amdhsa_system_sgpr_private_segment_wavefront_offset 0
                .amdhsa_system_sgpr_workgroup_id_x 1
                .amdhsa_system_sgpr_workgroup_id_y 0
                .amdhsa_system_sgpr_workgroup_id_z 0
                .amdhsa_system_sgpr_workgroup_info 0
                .amdhsa_system_vgpr_workitem_id 0
                .amdhsa_next_free_vgpr 24
                .amdhsa_next_free_sgpr 20
                .amdhsa_reserve_flat_scratch 0
                .amdhsa_float_round_mode_32 0
                .amdhsa_float_round_mode_16_64 0
                .amdhsa_float_denorm_mode_32 3
                .amdhsa_float_denorm_mode_16_64 3
                .amdhsa_dx10_clamp 1
                .amdhsa_ieee_mode 1
                .amdhsa_fp16_overflow 0
                .amdhsa_workgroup_processor_mode 1
                .amdhsa_memory_ordered 1
                .amdhsa_forward_progress 0
                .amdhsa_exception_fp_ieee_invalid_op 0
                .amdhsa_exception_fp_denorm_src 0
                .amdhsa_exception_fp_ieee_div_zero 0
                .amdhsa_exception_fp_ieee_overflow 0
                .amdhsa_exception_fp_ieee_underflow 0
                .amdhsa_exception_fp_ieee_inexact 0
                .amdhsa_exception_int_div_zero 0
        .end_amdhsa_kernel
        .text
.Lfunc_end0:
        .size   _Z19suboptimal_indexingPKiS0_Pimmm, .Lfunc_end0-_Z19suboptimal_indexingPKiS0_Pimmm
                                        ; -- End function
        .section        .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 564
; NumSgprs: 22
; NumVgprs: 24
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 2
; VGPRBlocks: 2
; NumSGPRsForWavesPerEU: 22
; NumVGPRsForWavesPerEU: 24
; Occupancy: 16
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
```
We can observe two issues with the code:
1) we use several vgprs for indexing the same array several times while we could just re-use the same vgpr-pair for the incremented address
2) incrementing the index with `globalThreadId += offset;` actually causes as many vgpr additions as there are accessed arrays while it could use the variant `global_load_dword, vdest, voffset, saddr` for the loads, which then would mean we would need to increment only the offset once for all arrays.

I think that both issues are quite a problem.

I hope that this can get fixed in a future release, if it has already been addressed, let me know and please disregard this.

Best regards,
Epliz.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztWltT6zgS_jXhRUXKlziXBx6AAIcaOIcCzszuvKhkW0k8-DaSncv59dst2bFinAu8LLWzVEgsqfvri1pqyZKfhZuLb1HPue5Z0551qb_vyYItOcl8ycWSh0RmCSey9LO8iBIWkzlPuWAFtDApeeLHGzLLBFktWEEk54kkcfTGSS54UWxIkCVJlsJPCFWsKLhIJVlFxYJcxyydE3vQt_oWkkVQmoksIc8_rh-JB9VO39TrFoQUi0gSvmZJHnPyBlg8VtA9tyLqDa3qo4qUzuPMZzGlurzMotCwhUZpyNcguOeMo7QAqFQWPeeSMPAJ2anxqxp8DvBZRr84Lch34_naeM5mM8mBekJ6oystnNRtWqfXheAsvA9Jz50SP86Ct_tw3V8TJQ2L0yhRxSswW5NCc8_dokUzaBy_A7tGnXbEEtVBQFvJj5RIC6DU4zUojo8gSX1avEQ5Qmx5zJaecwsfwoKAS0ksswkFXiEP63lXu0r2vKkyy-9qaUlom6cxK-8eUsb-Sso4X0kZ9yspM_hKynhfSZnhV1Jm9F9WxngOuoUCq9jh6o2mdWH72Dzs5onXBe_Ka5Hcl1iqb8cllF4_XH6_oz9ubx9-XE7p1c_v04cbCn8vr5fPr5SSRZSfsyScByn-4P9CsvP5bG1brl3DTfoFXxdNSTPQgok5L8A3znuELYTjNHy5yAoegBVQoH_ak45k9_Rb9GLRpyhJEoIT__k5ueLzKCWzMg2KCLL1ccZGIHZFfJIwQ0mHxdE8hcexYf8m56fhwHplYNXKVon-OJer4wctBvaT1UVyx_P9vtXEgjWRNM5YSMNVJmBBMVCJH-JyADQeRjqUrfXAIA9iVkoOlXYnxnqs-G0HAOxJhYA1Q6gYbRGtbuaBIkUF3b2sTsO7pEm2pL7rUO46ZGkrAgN5xaIigJwfz9-SAJc8YwtWBQYBS0Nkrw0faAkz-DOIkhLcikRjRTSuSA2XJDmN57QcNurbtbqGriwMaRrQslbXauCWptaB5DFEvqHYud3GQpHzAkVqrCAAN9b-ssem85emSvtMNwAGLSIJi2i-5oGm9t5b7wuWBguKNL_I1ZVF3XbE2WbELWksF7Hg0HHgsGXLYc4BlZFPMbUjFMuOETSeaQJ4vUTNh4rSHr43MgwDTTLSJPrH2yfZaYWnWa4mMPSCUw_Ug3_KSS5klJtX3BLcpzB3k4csy8k3SAlcIMiU58UCiOxWMAWZUnvptHrQdloxtSUPoib83DaXrrCN6m5x7YCxB6eI89pc3l5xRqfpMWJVg6AmqPZCxsy1tK06cMzuyIyB3MVk10xmNJlMBx2ty9L5mJ912f2om3X5mLC2l7UY76iw4WcsG33SsvFnLJucbFlXRzufiQ73WHR0MQ1qJjNrHWPyaiacuScfjsPh5-NwdLS32nE4_nwcTo4K-1-Pw-Fn4nD0mTgcfyYOJ_-Pw39EHDqfyZbO0WzZxbSdeT8Qh8525v3nxWGHOz4zaQyO9ZW57lUdq7_PzW3dSRubZpu1rDZZmAMnLX-YGx_dXdXiyz6I5LxDcs0ur5CqH_cQknUSknab7R1AGp8EpPvMHh0AGp4EpCcHe3IA6L27KyDPAKqGvHPqJkIb8ME9hB493Wv6ttZ7u9YzZrJq-nDcPVpbrcGo9wuDw1rbLa0P7UTaWu8No4Gh9bCaxlvjUxaZ4PUAbQ1nzb87QOudtQwCm-gt5XZ36e6-xOFpmM-NF1OSV--UJn2RhazAc6Ce47I4zoKu11fDnRd2C8lodSj1gbdhBvNcZGVOJZ8nPC3oLFrzkOKZjTFtGNS5iJas4CfTo2pMzDWFbQ26aErJBZXzXLxD98vZDPbY9mGuMJI5K8D5eXGU9u-Sl1wRdqrbEG4Vr1Q5AXurRxQeQ5_FrIBYEZo8jYpjDG3P7Pf4ii35TGQVDYR7p9ZyIwuedGM3CPq9eLccEwGGyZuOoyik6-MSd-g3H8T_9SH6dJYdpF_W9BGW9vVcytcQ64JzRU-czjhuiKQi6kQSXB1x70RAt8wZrBEKClakIU2ykFPoy9MI7SGFZcAB2pCnmUi2qO5plBq2kzhc2xa-bk7y7s6POHgFQbqbZzlgZ0suQOhqT1hvuzQXGR4PZeIAYMKTTGwozN5c8HCP0EysmAgRby6Mo-RdKr4OeI7zMyipzYjSJUzFIc3yEzgq_0kRnAofRkv6i4s9Ufue_rDb3tNDlHyMIYK4ZsGeWaAhj2C-6NC9D-mO7uSppqk5feo_4NEKpsadE48-TmG9E89niAFyfmouPP7Wt3n3e35ObtKQ7B4CtTL45eP07ulnP5AqkW_frf-mMzTORo19UI_3SB54ep9ebQquzhO94aBp_l4mLzCV4Ikg2R646frft_UG_YueT17Qa9BkNS2PakBc4RzRarnFcf6o77MAmNFyD71fN9hN9cP0BbXdCiE-lGTPud0OUbz2EWRJHsWcgP85ydJ40yzGUNG7p-crvG-iTWgaft_XgK6ANnmbiT8gP8knLm5-dvmlm8hw0o8gKHNYrG2UXcOmAVkeogQSgfiG109adl__eHz6-XpDn-4e6fPL8zUuCV-uny9fr7_Rm-8tr3YR_3y5eaZoBNKOD9O-Pl8-0W-XeKz7fAL06939lP6rUuOIzor236eprGj_PJH2fnp_R7GFXn9_NRl2j7H_4CRgaX3XixSrjERSlry6olUs-O71KhtvB604waNMyWG2YzHBLCzVDaN6ZCs-ySDYmBBss6XE-APkBcbiCpHLOCR_lbIggp8j5JYPMc9zFgmFi9VRGgiOSyI8nw9DTBJaJdwPNa21dKWKNgKMPXbTYGgRmFVLWOZvwB-giSRMkoSlG6UJCoxwVlHVAC_QMl7djUCF0MzasqioLKstWjIRMYzhWhHjjYPaw4Qc75jBQ31pDDZXaCPqVduPLBJbQEigeiYlKyUm4dCD4E5dSjnoU2SNR9SAVxDVCjJLA65gwdxK8517dvd4yy59g29WED8DD1YhgSb_XcKQJIxAovZjnrQYF1nONZ-6qIehhbcX1MYENAK-WVmUACN4zJnkaE80Q48twLEsxv7ZEJ-DbVUfc-WhGEAgKt5SyJQMpv1ccRNY5As-h3WDErejyxVXUYWN6DVdeZPH0a_-WXjhhhN3ws6KqIj5xYpJDKoszzNRlLD-j-rw19G7jert9UWwROcW4rPgDXLcWSnii0VR5Oq-iLo4MweA0u_D1AuFOF7WP-fgub8gR0FRuxUePM-1B2eLi3AwDr2x7_NwNvG45Vszf-bNIAsF7oCNrclZzHweywvYA_ccJ-Ur3TN4_8ObnkUXjuU4lmeNrNHA8Ub9GQsmHhvZADYeWI7bG1g8YVHcRz36mZifiQulkl_OJTTGkSxk08ikhE0u50oc4LOyWGTiQjnxTAm-UIr_B0RCjsw">