<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/122111>122111</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AMDGPU] Suboptimal instruction selection by GISel
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
tyb0807
</td>
</tr>
</table>
<pre>
Given this minimal reproducer IR:
```
; ModuleID = 'repro.ll'
source_filename = "whatever"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
target triple = "amdgcn-amd-amdhsa"
; Function Attrs: alwaysinline nofree norecurse nounwind
define dso_local void @"main"(ptr addrspace(1) %arg, ptr addrspace(5) %i85.out) #2 {
newFuncRoot:
br label %bb.split
bb.split: ; preds = %newFuncRoot
%i57 = load <8 x half>, ptr addrspace(1) %arg, align 2
%i67 = fmul <8 x half> %i57, splat (half 0xH31C5)
%i85 = shufflevector <8 x half> %i67, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x half> %i85, ptr addrspace(5) %i85.out, align 8
br label %bb374.exitStub
bb374.exitStub: ; preds = %bb.split
ret void
}
attributes #2 = { alwaysinline nofree norecurse nounwind "amdgpu-flat-work-group-size"="256,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2" "denormal-fp-math-f32"="preserve-sign" "uniform-work-group-size"="true" }
```
With GlobalISel, running `llc -O3 -march=amdgcn -mcpu=gfx942 -print-after=finalize-isel -mtriple amdgcn-amd-hmcsa --global-isel repro.ll -o /dev/null 2> gisel.mir && cat gisel.mir` produces the following MIR:
```
# *** IR Dump After Finalize ISel and expand pseudo-instructions (finalize-isel) ***:
# Machine code for function main: IsSSA, TracksLiveness, Legalized, RegBankSelected, Selected
Function Live Ins: $sgpr13 in %2, $sgpr14 in %3
bb.1.newFuncRoot:
liveins: $vgpr0, $vgpr1, $vgpr2
%4:vgpr_32 = COPY $vgpr0
%5:vgpr_32 = COPY $vgpr1
%0:vreg_64_align2 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
%1:vgpr_32 = COPY $vgpr2
%94:sreg_32 = S_MOV_B32 835006917
%90:vgpr_32 = GLOBAL_LOAD_USHORT %0:vreg_64_align2, 0, 0, implicit $exec :: (load (s16) from %ir.arg, addrspace 1)
%92:vgpr_32 = GLOBAL_LOAD_USHORT %0:vreg_64_align2, 2, 0, implicit $exec :: (load (s16) from %ir.arg + 2, addrspace 1)
%23:sreg_32 = S_MOV_B32 16
%101:vgpr_32 = COPY %23:sreg_32
%67:vgpr_32 = V_LSHL_OR_B32_e64 %92:vgpr_32, %101:vgpr_32, %90:vgpr_32, implicit $exec
%86:vgpr_32 = GLOBAL_LOAD_USHORT %0:vreg_64_align2, 4, 0, implicit $exec :: (load (s16) from %ir.arg + 4, addrspace 1)
%88:vgpr_32 = GLOBAL_LOAD_USHORT %0:vreg_64_align2, 6, 0, implicit $exec :: (load (s16) from %ir.arg + 6, addrspace 1)
%104:vgpr_32 = COPY %23:sreg_32
%70:vgpr_32 = V_LSHL_OR_B32_e64 %88:vgpr_32, %104:vgpr_32, %86:vgpr_32, implicit $exec
%105:vgpr_32 = COPY %94:sreg_32
%62:vgpr_32 = nofpexcept V_PK_MUL_F16 8, %67:vgpr_32, 8, %105:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%106:vgpr_32 = COPY %94:sreg_32
%63:vgpr_32 = nofpexcept V_PK_MUL_F16 8, %70:vgpr_32, 8, %106:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%11:vreg_64_align2 = REG_SEQUENCE %62:vgpr_32, %subreg.sub0, %63:vgpr_32, %subreg.sub1
SCRATCH_STORE_DWORDX2 %11:vreg_64_align2, %1:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (store (<4 x s16>) into %ir.i85.out, addrspace 5)
SI_RETURN
# End machine code for function main.
```
Whereas with SelectionDAG, running `llc -O3 -march=amdgcn -mcpu=gfx942 -print-after=finalize-isel -mtriple amdgcn-amd-hmcsa repro.ll -o /dev/null 2> seldag.mir && cat seldag.mir` produces the following MIR:
```
# *** IR Dump After Finalize ISel and expand pseudo-instructions (finalize-isel) ***:
# Machine code for function main: IsSSA, TracksLiveness
Function Live Ins: $vgpr0 in %2, $vgpr1 in %3, $vgpr2 in %4
bb.0.newFuncRoot:
liveins: $vgpr0, $vgpr1, $vgpr2
%4:vgpr_32 = COPY $vgpr2
%3:vgpr_32 = COPY $vgpr1
%2:vgpr_32 = COPY $vgpr0
%22:sgpr_32 = IMPLICIT_DEF
%23:sgpr_32 = IMPLICIT_DEF
%27:vreg_64_align2 = REG_SEQUENCE %2:vgpr_32, %subreg.sub0, %3:vgpr_32, %subreg.sub1
%7:vreg_64_align2 = COPY %27:vreg_64_align2
%6:vgpr_32 = GLOBAL_LOAD_USHORT %7:vreg_64_align2, 0, 0, implicit $exec :: (load (s16) from %ir.arg, addrspace 1)
%9:vreg_64_align2 = COPY %27:vreg_64_align2
%8:vgpr_32 = GLOBAL_LOAD_USHORT %9:vreg_64_align2, 2, 0, implicit $exec :: (load (s16) from %ir.arg + 2, addrspace 1)
%11:vreg_64_align2 = COPY %27:vreg_64_align2
%10:vgpr_32 = GLOBAL_LOAD_USHORT %11:vreg_64_align2, 4, 0, implicit $exec :: (load (s16) from %ir.arg + 4, addrspace 1)
%13:vreg_64_align2 = COPY %27:vreg_64_align2
%12:vgpr_32 = GLOBAL_LOAD_USHORT %13:vreg_64_align2, 6, 0, implicit $exec :: (load (s16) from %ir.arg + 6, addrspace 1)
%14:sreg_32 = S_MOV_B32 84148480
%15:vgpr_32 = V_PERM_B32_e64 killed %12:vgpr_32, killed %10:vgpr_32, %14:sreg_32, implicit $exec
%16:sreg_32 = S_MOV_B32 12741
%17:vgpr_32 = nofpexcept V_PK_MUL_F16 8, killed %15:vgpr_32, 0, %16:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%18:vgpr_32 = V_PERM_B32_e64 killed %8:vgpr_32, killed %6:vgpr_32, %14:sreg_32, implicit $exec
%19:vgpr_32 = nofpexcept V_PK_MUL_F16 8, killed %18:vgpr_32, 0, %16:sreg_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
%24:sgpr_32 = IMPLICIT_DEF
%25:sgpr_32 = IMPLICIT_DEF
%26:vreg_64_align2 = REG_SEQUENCE %19:vgpr_32, %subreg.sub0, %17:vgpr_32, %subreg.sub1
%21:vreg_64_align2 = COPY %26:vreg_64_align2
SCRATCH_STORE_DWORDX2 killed %21:vreg_64_align2, %4:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into %ir.i85.out, addrspace 5)
SI_RETURN
# End machine code for function main.
```
GISel generates `V_LSHL_OR_B32_e64` instead of the `V_PERM_B32_e64` that SelectionDAG selects. Could this be a performance issue?
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzcWUtz4j4S_zTKRSXKlo0xhxycABlqk39mIZn_7okSdmO0Iz9Wkkkyn35LfsUxJsNkHoedytiyutVPqfunginF4xTgEo2v0Hh2wQq9z-SlftlavjW52GbRy-UNP0CK9Z4rnPCUJ0xgCbnMoiIEiZcrZAXIs-o_K0DOFb7LokLAcoaRM8OITkr-kRCITpAVqKyQIWx2XEDKEqiZ6NOeaTiARJQiK9BMxqBxxDQT7CUrdMMGJEdO4Lnlg-R294MiJ3CqB8md7ofbZRt3KV73Y4KcwPYs5AR07LVPQ_ENhdZPkk_NYEqPGXmj5WB7pTByoG5FOjRqDq5f80y9WuChFUYOrURyGNtmdmxTcrCtUo55kQO1ShHmRdJSrOeStUNJMCY3Nkk5cgLji2GavgmoljwXbcxZEsVhSlgSmf97xSreOo2LIg01z1IcaC0VcgLMxBN7UTwVPAWcZjsJ5iUhLKQyoyJ94mmErCCCnWGJVLYRWcgEPmQ8wsi1EKUJ46nRQ_1cS8yiSKqchYCobyM6xYiOmYwRvcZ98rgmc388ygpdfTkUo8kVsoIUnozBqyzTyDEuYLyVWLAtCLNoux2pXHBdudd-OQE-958JSS4hUnX0xl2NRp0xbTwpqSJjEUbOtY-f8Z6JHXLmQx71HGaCxyk2GShleZWsXVKInqxalVmjcsE0RtQ3JGw9f3LsaxOo1iJ_XEpR-2K3E3CAUGdySJxXiusR8oyrLK0JLn7G3Oy2ufniDsWWoZiB3QxoM3CMy1aAlc4kNKu7-kw6z8hxExV_IKPOxB3BM9drXWybvL6ZOyu7R3ntbBWMJehy8xr5k1mlhWkt-bbQoOoNaNZNrs48Hs3BywuyE0yTp0x-JbHMipwo_g3M0XBmiFJTB-h1-aSdNWlGWJzLo8kwS3IB5rwSVh7bI44IdqwQmvy3gAKOqVzlTId7wqPTtFwfK94Dy4cJmdIhE2KQyJNc8JCbsjRIF5EiX0GmIIYMSgqheSx5RNRLGpLy_PRYSi8HRZuAV_HmEXkeJHMNyfep395Sn9gBFMlBEihes1jzRJBmMmGC7HKSML0nO4e2TLkEBfIAxLTjekGR8l0mk5PbQ8s6idWu7LTgR8XTuOrYYZYkLI3MQUCeJUSIyb2DScJkuEfOrKr_mCRhXiBnFu-epy7FmOSSp5qwnQaJnNmOp0zwb0C4AoFJUveQTvPYJ6FimJBYZFsmKr6m6WOSYUQXERwQXaSFELisILFhGiVcYkQ9RD0cMv06aRyh13iJ9-wAlSsxpCCZhgjfLVdVhX-DO6iDEQ2qP7xc4VmR5DgwLuBF7QBerkFglkYYnnPzyhUUUUZ4qrQsykNjTrT_xuGqINWCa73UwXcs3JtjHmYR4F0m8a7plmV_cwK8VOt1YLx4kCz8qm4NjgKlzMwtxKWCyHysIL5i6dc1CAh1NdWOraBtwmY9XqZlH0bUVXEubQfz1FSssuo2k2496bStzh4NNEfBD8BbcYc4l1YtxYztzpg2vcQAEDOxqWve9f3nf78urpnG7zDZDZOBWAcJ8cZzN2V9r3hX85vNev7Px_lf1_OewsqesSq2EuKRKra1tV19RzytPvsdo1r3pkadMlbVXOvN3f2XzZVDse-MLcub2pOW1-pJvLm9vwpuN7f3wWzzuP50v3oY9tPYaLWPpgoaW-AZQmyyU2bEryAE9ZXtmS24k1lSNkY5atBC0zZN9237fYkjP2oY_WnDMKJXlZhh86hzMsa212bLOpGvN8sbbm_SY_6yuV1_ut3cr4zYDXhuPy71Pnmjpp7r5nUoDo1S3_uJMLu_Jszu6TD7_k-Y5_0a87zT5tnWiVIymOBJ_6wNJrjrcpvg4wLSTdx7CbatE4XsTZlo92D_1KXZLofnEHKNv2w-_2Nz93i7Wdge9mszupvWTPmtyb16Zp16
dA1Psgjed6a_Xd91xvlBZybWKWe8X-yMEWqf1zu84wN_3Dy6rp7oHuvrVfBw_WmzfrhfzTezv-9Xs3_RE4Y0fp_y-sil3py5D2xUKDvnrL4_Ub--QpnzZi6TU8xTndVHrntbag9cewtcLzer-cPj6q_6ak8dPE8jnLyLYEY9fPX3HiQwhZ-43v8AlvwglPwOeFQgIhb30ePr7P8zfDwNCUsY1kOEJepqAWEH09VzbgsSrT8AElumfoUZBIn9mjoIN811KFAdruXd59vl9fJhM5sv6pJRNZV3eEqmyXl15Zyy8v2qYqrmsL62Ex7T2wJ9VnMfWP8bsafpJR926Dy0ciz_z2DWUx3nDMfs864JJ1rJ78WJRq_zcc_Ou2cMaPhDEPOdy5xru77rW00Q-kDvy-bzfHXXYsuvXAiI-j4bzR2KdQw-u8jqPWzmnb4R0Yn7eoXtX3PeRWUd04YAZU_vrwOa_ZN8Mpb-qVB6H4_k9IMR8v9ghMpfcb7bi8bnMHnnNaxuWE52LHtyTsui36uExyaZpcMQ-jUBA2JrG9zfgaSV5_4Ifv4l8PlmaQBvA0KVQdBHt1jkWSW6TDNd0btHp4a0YVaIqEK0W8AsxVypotKfg9xlMmFpCMhZXESXTjR1puwCLu2J4zmu69v-xf6Sgtnx7sSiFhtHPkypMwV35058Cm4I9gW_pBYdW7bl265l2XQ0AY_61JqwKUwdPwTkWpAwLkZCHJJRJuOL0ohLm1Lbti_KH2hU-ZMypSk8VSYiStF4diEvzSKyLWKFXEtwpdWrGM21KH-LDu5mN58f0XiG18WWZLnmCf8GEe4gbYP2oRptX_CNgeYXhRSXe61zg1cRXSC6iLneF9tRmCWILoye-kVymf0HQo3oorROIbqozT9c0v8FAAD__1KK3qU">