<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/55314>55314</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
wasted opportunities with array indexing patterns in AMDGPU backend
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
Epliz
</td>
</tr>
</table>
<pre>
Hi,
I have observed some suboptimal generated assembly for what seems like pretty common code patterns with Clang 14.0.0 coming from ROCM 5.0.2.
For this example kernel code:
```
__global__
void suboptimal_indexing(int const* a, int const* b, int* c, size_t N, size_t C, size_t offset) {
size_t globalThreadId = blockIdx.x * blockDim.x + threadIdx.x;
if (globalThreadId < N) {
for (size_t i = 0; i < C; i++) {
int r = 0;
// access 0
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 1
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 2
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 3
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 4
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 5
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 6
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
// access 7
r += a[globalThreadId] + b[globalThreadId];
globalThreadId += offset;
c[globalThreadId] = r;
}
}
}
```
The generated assembly is:
```
# __CLANG_OFFLOAD_BUNDLE____START__ hip-amdgcn-amd-amdhsa-gfx1031
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx1031"
.protected _Z19suboptimal_indexingPKiS0_Pimmm ; -- Begin function _Z19suboptimal_indexingPKiS0_Pimmm
.globl _Z19suboptimal_indexingPKiS0_Pimmm
.p2align 8
.type _Z19suboptimal_indexingPKiS0_Pimmm,@function
_Z19suboptimal_indexingPKiS0_Pimmm: ; @_Z19suboptimal_indexingPKiS0_Pimmm
; %bb.0:
s_load_dword s4, s[4:5], 0x4
s_clause 0x1
s_load_dwordx8 s[12:19], s[6:7], 0x0
s_load_dwordx4 s[0:3], s[6:7], 0x20
v_mov_b32_e32 v1, 0
s_waitcnt lgkmcnt(0)
s_and_b32 s4, s4, 0xffff
s_mul_i32 s8, s8, s4
s_cmp_lg_u64 s[0:1], 0
v_add_nc_u32_e32 v0, s8, v0
s_cselect_b32 s4, -1, 0
v_cmp_gt_u64_e32 vcc_lo, s[18:19], v[0:1]
s_and_b32 s4, vcc_lo, s4
s_and_saveexec_b32 s5, s4
s_cbranch_execz BB0_3
; %bb.1:
v_lshlrev_b64 v[0:1], 2, v[0:1]
s_lshl_b64 s[4:5], s[2:3], 5
s_add_u32 s6, s16, s4
s_addc_u32 s7, s17, s5
s_lshl_b64 s[2:3], s[2:3], 2
BB0_2: ; =>This Inner Loop Header: Depth=1
v_add_co_u32 v2, vcc_lo, s12, v0
v_add_co_ci_u32_e32 v3, vcc_lo, s13, v1, vcc_lo
v_add_co_u32 v4, vcc_lo, s14, v0
v_add_co_ci_u32_e32 v5, vcc_lo, s15, v1, vcc_lo
s_add_u32 s0, s0, -1
global_load_dword v10, v[2:3], off
global_load_dword v11, v[4:5], off
v_add_co_u32 v2, vcc_lo, v2, s2
v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
v_add_co_u32 v4, vcc_lo, v4, s2
v_add_co_ci_u32_e32 v5, vcc_lo, s3, v5, vcc_lo
v_add_co_u32 v6, vcc_lo, v2, s2
v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
v_add_co_u32 v8, vcc_lo, v4, s2
v_add_co_ci_u32_e32 v9, vcc_lo, s3, v5, vcc_lo
global_load_dword v12, v[2:3], off
global_load_dword v13, v[4:5], off
global_load_dword v14, v[6:7], off
global_load_dword v15, v[8:9], off
v_add_co_u32 v2, vcc_lo, v6, s2
v_add_co_ci_u32_e32 v3, vcc_lo, s3, v7, vcc_lo
v_add_co_u32 v4, vcc_lo, v8, s2
v_add_co_ci_u32_e32 v5, vcc_lo, s3, v9, vcc_lo
v_add_co_u32 v6, vcc_lo, v2, s2
v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
v_add_co_u32 v8, vcc_lo, v4, s2
v_add_co_ci_u32_e32 v9, vcc_lo, s3, v5, vcc_lo
global_load_dword v16, v[2:3], off
global_load_dword v17, v[4:5], off
global_load_dword v18, v[6:7], off
global_load_dword v19, v[8:9], off
v_add_co_u32 v2, vcc_lo, v6, s2
v_add_co_ci_u32_e32 v3, vcc_lo, s3, v7, vcc_lo
v_add_co_u32 v4, vcc_lo, v8, s2
v_add_co_ci_u32_e32 v5, vcc_lo, s3, v9, vcc_lo
v_add_co_u32 v6, vcc_lo, v2, s2
v_add_co_ci_u32_e32 v7, vcc_lo, s3, v3, vcc_lo
v_add_co_u32 v8, vcc_lo, v4, s2
v_add_co_ci_u32_e32 v9, vcc_lo, s3, v5, vcc_lo
global_load_dword v20, v[2:3], off
global_load_dword v21, v[4:5], off
global_load_dword v22, v[6:7], off
global_load_dword v23, v[8:9], off
v_add_co_u32 v2, vcc_lo, v6, s2
v_add_co_ci_u32_e32 v3, vcc_lo, s3, v7, vcc_lo
v_add_co_u32 v4, vcc_lo, v8, s2
v_add_co_ci_u32_e32 v5, vcc_lo, s3, v9, vcc_lo
global_load_dword v6, v[2:3], off
global_load_dword v4, v[4:5], off
s_addc_u32 s1, s1, -1
s_cmp_lg_u64 s[0:1], 0
s_waitcnt vmcnt(14)
v_add_nc_u32_e32 v2, v11, v10
s_waitcnt vmcnt(12)
v_add3_u32 v2, v2, v12, v13
s_waitcnt vmcnt(10)
v_add3_u32 v2, v2, v14, v15
s_waitcnt vmcnt(8)
v_add3_u32 v2, v2, v16, v17
s_waitcnt vmcnt(6)
v_add3_u32 v2, v2, v18, v19
s_waitcnt vmcnt(4)
v_add3_u32 v5, v2, v20, v21
v_add_co_u32 v2, vcc_lo, s6, v0
v_add_co_ci_u32_e32 v3, vcc_lo, s7, v1, vcc_lo
s_waitcnt vmcnt(2)
v_add3_u32 v5, v5, v22, v23
v_add_co_u32 v0, vcc_lo, v0, s4
v_add_co_ci_u32_e32 v1, vcc_lo, s5, v1, vcc_lo
s_waitcnt vmcnt(0)
v_add3_u32 v4, v5, v6, v4
global_store_dword v[2:3], v4, off
s_cbranch_scc1 BB0_2
BB0_3:
s_endpgm
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z19suboptimal_indexingPKiS0_Pimmm
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 104
.amdhsa_user_sgpr_private_segment_buffer 1
.amdhsa_user_sgpr_dispatch_ptr 1
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_flat_scratch_init 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_wavefront_size32 1
.amdhsa_system_sgpr_private_segment_wavefront_offset 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 24
.amdhsa_next_free_sgpr 20
.amdhsa_reserve_flat_scratch 0
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_workgroup_processor_mode 1
.amdhsa_memory_ordered 1
.amdhsa_forward_progress 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size _Z19suboptimal_indexingPKiS0_Pimmm, .Lfunc_end0-_Z19suboptimal_indexingPKiS0_Pimmm
; -- End function
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 564
; NumSgprs: 22
; NumVgprs: 24
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 2
; VGPRBlocks: 2
; NumSGPRsForWavesPerEU: 22
; NumVGPRsForWavesPerEU: 24
; Occupancy: 16
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
```
We can observe two issues with the code:
1) we use several vgprs for indexing the same array several times while we could just re-use the same vgpr-pair for the incremented address
2) incrementing the index with `globalThreadId += offset;` actually causes as many vgpr additions as there are accessed arrays while it could use the variant `global_load_dword, vdest, voffset, saddr` for the loads, which then would mean we would need to increment only the offset once for all arrays.
I think that both issues are quite a problem.
I hope that this can get fixed in a future release, if it has already been addressed, let me know and please disregard this.
Best regards,
Epliz.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztWltT6zgS_jXhRUXKlziXBx6AAIcaOIcCzszuvKhkW0k8-DaSncv59dst2bFinAu8LLWzVEgsqfvri1pqyZKfhZuLb1HPue5Z0551qb_vyYItOcl8ycWSh0RmCSey9LO8iBIWkzlPuWAFtDApeeLHGzLLBFktWEEk54kkcfTGSS54UWxIkCVJlsJPCFWsKLhIJVlFxYJcxyydE3vQt_oWkkVQmoksIc8_rh-JB9VO39TrFoQUi0gSvmZJHnPyBlg8VtA9tyLqDa3qo4qUzuPMZzGlurzMotCwhUZpyNcguOeMo7QAqFQWPeeSMPAJ2anxqxp8DvBZRr84Lch34_naeM5mM8mBekJ6oystnNRtWqfXheAsvA9Jz50SP86Ct_tw3V8TJQ2L0yhRxSswW5NCc8_dokUzaBy_A7tGnXbEEtVBQFvJj5RIC6DU4zUojo8gSX1avEQ5Qmx5zJaecwsfwoKAS0ksswkFXiEP63lXu0r2vKkyy-9qaUlom6cxK-8eUsb-Sso4X0kZ9yspM_hKynhfSZnhV1Jm9F9WxngOuoUCq9jh6o2mdWH72Dzs5onXBe_Ka5Hcl1iqb8cllF4_XH6_oz9ubx9-XE7p1c_v04cbCn8vr5fPr5SSRZSfsyScByn-4P9CsvP5bG1brl3DTfoFXxdNSTPQgok5L8A3znuELYTjNHy5yAoegBVQoH_ak45k9_Rb9GLRpyhJEoIT__k5ueLzKCWzMg2KCLL1ccZGIHZFfJIwQ0mHxdE8hcexYf8m56fhwHplYNXKVon-OJer4wctBvaT1UVyx_P9vtXEgjWRNM5YSMNVJmBBMVCJH-JyADQeRjqUrfXAIA9iVkoOlXYnxnqs-G0HAOxJhYA1Q6gYbRGtbuaBIkUF3b2sTsO7pEm2pL7rUO46ZGkrAgN5xaIigJwfz9-SAJc8YwtWBQYBS0Nkrw0faAkz-DOIkhLcikRjRTSuSA2XJDmN57QcNurbtbqGriwMaRrQslbXauCWptaB5DFEvqHYud3GQpHzAkVqrCAAN9b-ssem85emSvtMNwAGLSIJi2i-5oGm9t5b7wuWBguKNL_I1ZVF3XbE2WbELWksF7Hg0HHgsGXLYc4BlZFPMbUjFMuOETSeaQJ4vUTNh4rSHr43MgwDTTLSJPrH2yfZaYWnWa4mMPSCUw_Ug3_KSS5klJtX3BLcpzB3k4csy8k3SAlcIMiU58UCiOxWMAWZUnvptHrQdloxtSUPoib83DaXrrCN6m5x7YCxB6eI89pc3l5xRqfpMWJVg6AmqPZCxsy1tK06cMzuyIyB3MVk10xmNJlMBx2ty9L5mJ912f2om3X5mLC2l7UY76iw4WcsG33SsvFnLJucbFlXRzufiQ73WHR0MQ1qJjNrHWPyaiacuScfjsPh5-NwdLS32nE4_nwcTo4K-1-Pw-Fn4nD0mTgcfyYOJ_-Pw39EHDqfyZbO0WzZxbSdeT8Qh8525v3nxWGHOz4zaQyO9ZW57lUdq7_PzW3dSRubZpu1rDZZmAMnLX-YGx_dXdXiyz6I5LxDcs0ur5CqH_cQknUSknab7R1AGp8EpPvMHh0AGp4EpCcHe3IA6L27KyDPAKqGvHPqJkIb8ME9hB493Wv6ttZ7u9YzZrJq-nDcPVpbrcGo9wuDw1rbLa0P7UTaWu8No4Gh9bCaxlvjUxaZ4PUAbQ1nzb87QOudtQwCm-gt5XZ36e6-xOFpmM-NF1OSV--UJn2RhazAc6Ce47I4zoKu11fDnRd2C8lodSj1gbdhBvNcZGVOJZ8nPC3oLFrzkOKZjTFtGNS5iJas4CfTo2pMzDWFbQ26aErJBZXzXLxD98vZDPbY9mGuMJI5K8D5eXGU9u-Sl1wRdqrbEG4Vr1Q5AXurRxQeQ5_FrIBYEZo8jYpjDG3P7Pf4ii35TGQVDYR7p9ZyIwuedGM3CPq9eLccEwGGyZuOoyik6-MSd-g3H8T_9SH6dJYdpF_W9BGW9vVcytcQ64JzRU-czjhuiKQi6kQSXB1x70RAt8wZrBEKClakIU2ykFPoy9MI7SGFZcAB2pCnmUi2qO5plBq2kzhc2xa-bk7y7s6POHgFQbqbZzlgZ0suQOhqT1hvuzQXGR4PZeIAYMKTTGwozN5c8HCP0EysmAgRby6Mo-RdKr4OeI7zMyipzYjSJUzFIc3yEzgq_0kRnAofRkv6i4s9Ufue_rDb3tNDlHyMIYK4ZsGeWaAhj2C-6NC9D-mO7uSppqk5feo_4NEKpsadE48-TmG9E89niAFyfmouPP7Wt3n3e35ObtKQ7B4CtTL45eP07ulnP5AqkW_frf-mMzTORo19UI_3SB54ep9ebQquzhO94aBp_l4mLzCV4Ikg2R646frft_UG_YueT17Qa9BkNS2PakBc4RzRarnFcf6o77MAmNFyD71fN9hN9cP0BbXdCiE-lGTPud0OUbz2EWRJHsWcgP85ydJ40yzGUNG7p-crvG-iTWgaft_XgK6ANnmbiT8gP8knLm5-dvmlm8hw0o8gKHNYrG2UXcOmAVkeogQSgfiG109adl__eHz6-XpDn-4e6fPL8zUuCV-uny9fr7_Rm-8tr3YR_3y5eaZoBNKOD9O-Pl8-0W-XeKz7fAL06939lP6rUuOIzor236eprGj_PJH2fnp_R7GFXn9_NRl2j7H_4CRgaX3XixSrjERSlry6olUs-O71KhtvB604waNMyWG2YzHBLCzVDaN6ZCs-ySDYmBBss6XE-APkBcbiCpHLOCR_lbIggp8j5JYPMc9zFgmFi9VRGgiOSyI8nw9DTBJaJdwPNa21dKWKNgKMPXbTYGgRmFVLWOZvwB-giSRMkoSlG6UJCoxwVlHVAC_QMl7djUCF0MzasqioLKstWjIRMYzhWhHjjYPaw4Qc75jBQ31pDDZXaCPqVduPLBJbQEigeiYlKyUm4dCD4E5dSjnoU2SNR9SAVxDVCjJLA65gwdxK8517dvd4yy59g29WED8DD1YhgSb_XcKQJIxAovZjnrQYF1nONZ-6qIehhbcX1MYENAK-WVmUACN4zJnkaE80Q48twLEsxv7ZEJ-DbVUfc-WhGEAgKt5SyJQMpv1ccRNY5As-h3WDErejyxVXUYWN6DVdeZPH0a_-WXjhhhN3ws6KqIj5xYpJDKoszzNRlLD-j-rw19G7jert9UWwROcW4rPgDXLcWSnii0VR5Oq-iLo4MweA0u_D1AuFOF7WP-fgub8gR0FRuxUePM-1B2eLi3AwDr2x7_NwNvG45Vszf-bNIAsF7oCNrclZzHweywvYA_ccJ-Ur3TN4_8ObnkUXjuU4lmeNrNHA8Ub9GQsmHhvZADYeWI7bG1g8YVHcRz36mZifiQulkl_OJTTGkSxk08ikhE0u50oc4LOyWGTiQjnxTAm-UIr_B0RCjsw">