[llvm] [AMDGPU][True16][CodeGen] stop emitting spgr_lo16 from isel (PR #144819)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 19 13:44:18 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Brox Chen (broxigarchen)
<details>
<summary>Changes</summary>
When true16 is enabled, isel start to emit sgpr_lo16 register when a trunc/sext i16/i32 is generated, or a salu32 is used by vgpr16 or vice versa. And this causes a problem as sgpr_lo16 is not fully supported in the pipeline.
True16 mode works fine in -O3 mode since folding pass remove the sgpr_lo16 from the pipeline. However it hit a problem in -O0 mode, and the spgr_lo16 hit the spill/store and other pass in the later pipeline.
This patch did:
1. stop emitting sgpr_lo16 from isel
2. update codegen pattern to split uniformed/divergent pattern for i16/i32 conversion
3. update fix-sgpr-copy pass to address legalization requirement in true16 mode
----------------------------------------
Post it for review. There are still testing needed to be done in downstream branches.
---
Patch is 6.94 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144819.diff
43 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp (+1-1)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+11-3)
- (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp (+1-3)
- (modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+20-3)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+16460-28699)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+557-1170)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll (+96-204)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+2280-4715)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+2694-5639)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+191-398)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+103-209)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+4977-8778)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.576bit.ll (+1988-4894)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.640bit.ll (+1840-3422)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+688-1424)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.704bit.ll (+2054-3758)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+2266-4110)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+2514-4478)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+2712-4784)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+2952-5180)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+339-716)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll (+9)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+30-48)
- (modified) llvm/test/CodeGen/AMDGPU/br_cc.f16.ll (+15-20)
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+46-99)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+31-63)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+38-80)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+26-55)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+70-146)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+118-245)
- (modified) llvm/test/CodeGen/AMDGPU/frem.ll (+254-241)
- (modified) llvm/test/CodeGen/AMDGPU/function-returns.ll (+26-53)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+257-474)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+157-307)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+63-135)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+87-184)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.tbuffer.load.d16.ll (+9-13)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.tbuffer.load.d16.ll (+18-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.tbuffer.load.d16.ll (+10-15)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.tbuffer.load.d16.ll (+20-31)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i16.ll (+34-71)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+47-98)
- (modified) llvm/test/CodeGen/AMDGPU/select.f16.ll (+36-31)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 1bf5b4a241780..555455d704c84 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -1088,7 +1088,7 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
assert(MF.getSubtarget<GCNSubtarget>().useRealTrue16Insts() &&
"We do not expect to see 16-bit copies from VGPR to SGPR unless "
"we have 16-bit VGPRs");
- assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_LO16RegClass ||
+ assert(MRI->getRegClass(DstReg) == &AMDGPU::SGPR_32RegClass ||
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32RegClass ||
MRI->getRegClass(DstReg) == &AMDGPU::SReg_32_XM0RegClass);
// There is no V_READFIRSTLANE_B16, so legalize the dst/src reg to 32 bits
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2ebf8b99e9d7b..198695fc9f030 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7245,7 +7245,8 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI, unsigned OpIdx,
MachineBasicBlock *MBB = MI.getParent();
// Legalize operands and check for size mismatch
if (!OpIdx || OpIdx >= MI.getNumExplicitOperands() ||
- OpIdx >= get(Opcode).getNumOperands())
+ OpIdx >= get(Opcode).getNumOperands() ||
+ get(Opcode).operands()[OpIdx].RegClass == -1)
return;
MachineOperand &Op = MI.getOperand(OpIdx);
@@ -7803,8 +7804,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// that copies will end up as machine instructions and not be
// eliminated.
addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
- MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
- MRI.clearKillFlags(Inst.getOperand(1).getReg());
+ Register NewDstReg = Inst.getOperand(1).getReg();
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ MRI.clearKillFlags(NewDstReg);
Inst.getOperand(0).setReg(DstReg);
// Make sure we don't leave around a dead VGPR->SGPR copy. Normally
// these are deleted later, but at -O0 it would leave a suspicious
@@ -7812,6 +7814,12 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
Inst.removeOperand(I);
Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+ // Legalize t16 operand since replaceReg is called after addUsersToVALU
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+ E = MRI.use_end();
+ I != E; ++I) {
+ legalizeOperandsVALUt16(*I->getParent(), MRI);
+ }
return;
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index e41189adfb46f..8062d1bfe3d20 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3553,9 +3553,7 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth == 16)
- return &AMDGPU::SGPR_LO16RegClass;
- if (BitWidth == 32)
+ if (BitWidth == 16 || BitWidth == 32)
return &AMDGPU::SReg_32RegClass;
if (BitWidth == 64)
return &AMDGPU::SReg_64RegClass;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 926df955881e0..ac6725ded4f10 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1470,17 +1470,34 @@ def : GCNPat<
>;
def : GCNPat<
- (i64 (anyext i16:$src)),
+ (i64 (UniformUnaryFrag<anyext> i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (COPY $src)), sub0,
+ (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<anyext> i16:$src)),
(REG_SEQUENCE VReg_64, $src, lo16, (i16 (IMPLICIT_DEF)), hi16, (i32 (IMPLICIT_DEF)), sub1)
>;
def : GCNPat<
- (i16 (trunc i32:$src)),
+ (i16 (UniformUnaryFrag<trunc> i32:$src)),
+ (COPY $src)
+>;
+
+def : GCNPat<
+ (i16 (DivergentUnaryFrag<trunc> i32:$src)),
(EXTRACT_SUBREG $src, lo16)
>;
def : GCNPat <
- (i16 (trunc i64:$src)),
+ (i16 (UniformUnaryFrag<trunc> i64:$src)),
+ (EXTRACT_SUBREG $src, sub0)
+>;
+
+def : GCNPat <
+ (i16 (DivergentUnaryFrag<trunc> i64:$src)),
(EXTRACT_SUBREG $src, lo16)
>;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index f729de82cb042..cb2f0f28a29d6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -10438,1974 +10438,951 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
; GFX9-NEXT: ; kill: killed $sgpr47
; GFX9-NEXT: s_branch .LBB13_2
;
-; GFX11-TRUE16-LABEL: bitcast_v32i32_to_v128i8_scalar:
-; GFX11-TRUE16: ; %bb.0:
-; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_xor_saveexec_b32 s4, -1
-; GFX11-TRUE16-NEXT: s_clause 0x3
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v16, s32
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v17, s32 offset:4
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v18, s32 offset:8
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v19, s32 offset:12
-; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s30, 0
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s96, 0
-; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v15
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s40, v1
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s41, v2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s31, 1
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s97, 1
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s14, v3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s15, v4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s12, v5
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s34, 2
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s98, 2
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v6
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s10, v7
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s11, v8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s35, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s99, 3
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v9
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v10
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s6, v11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s36, 4
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s100, 4
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s7, v12
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v13
-; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v14
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s37, 5
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s101, 5
-; GFX11-TRUE16-NEXT: s_mov_b32 s57, 0
-; GFX11-TRUE16-NEXT: s_and_b32 s42, vcc_lo, exec_lo
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18 : SGPR spill to VGPR lane
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19 : SGPR spill to VGPR lane
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s38, 6
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s102, 6
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s39, 7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s103, 7
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s48, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v17, s104, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s49, 9
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s50, 10
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s51, 11
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s52, 12
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s53, 13
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s54, 14
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s55, 15
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s64, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s65, 17
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s66, 18
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s67, 19
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s68, 20
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s69, 21
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s70, 22
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s71, 23
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s80, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s81, 25
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s82, 26
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s83, 27
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s84, 28
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s85, 29
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s86, 30
-; GFX11-TRUE16-NEXT: v_writelane_b32 v16, s87, 31
-; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB13_4
-; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s103, s29, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s104, s28, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s34, s28, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s35, s27, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6
-; GFX11-TRUE16-NEXT: s_lshr_b32 s36, s27, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s37, s27, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s38, s26, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4
-; GFX11-TRUE16-NEXT: s_lshr_b32 s39, s26, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s48, s25, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s49, s25, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s50, s25, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s51, s24, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s52, s24, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s53, s23, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s54, s23, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s55, s23, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s64, s22, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s65, s22, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s66, s21, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s67, s21, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s68, s21, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s69, s20, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s70, s20, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s71, s19, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s80, s19, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s81, s19, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s82, s18, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s43, s17, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s83, s17, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s84, s17, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s85, s16, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s44, s16, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s45, s3, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 17
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s86, s3, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s87, s3, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s46, s2, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s47, s2, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s96, s1, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s97, s1, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 15
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s10, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s98, s1, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s99, s0, 16
-; GFX11-TRUE16-NEXT: s_lshr_b32 s56, s0, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 14
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 3
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[12:13], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[60:61], s[14:15], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 13
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 16
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[62:63], s[40:41], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[72:73], s[28:29], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[74:75], s[26:27], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 12
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s13, 8
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[78:79], s[24:25], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[76:77], s[22:23], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[88:89], s[20:21], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 11
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 16
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[90:91], s[18:19], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[92:93], s[16:17], 24
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[94:95], s[2:3], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 10
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s12, 8
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[30:31], s[0:1], 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 9
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s15, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 7
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 16
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 6
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s14, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 5
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 4
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s41, 8
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 2
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s40, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s29, 24
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s18, 8
-; GFX11-TRUE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s57
-; GFX11-TRUE16-NEXT: s_cbranch_vccnz .LBB13_3
-; GFX11-TRUE16-NEXT: .LBB13_2: ; %cmp.true
-; GFX11-TRUE16-NEXT: s_add_i32 s5, s5, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s4, s4, 3
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 24
-; GFX11-TRUE16-NEXT: s_add_i32 s7, s7, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 1
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s6, s6, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s9, s9, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s8, s8, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s42, 0
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s5, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s11, s11, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 31
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s10, s10, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s13, s13, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s12, s12, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 30
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s4, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s15, s15, 3
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[4:5], 24
-; GFX11-TRUE16-NEXT: s_add_i32 s14, s14, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 29
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s41, s41, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s40, s40, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 28
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 9
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[6:7], 24
-; GFX11-TRUE16-NEXT: s_add_i32 s29, s29, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 27
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s7, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 6
-; GFX11-TRUE16-NEXT: s_add_i32 s1, s1, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s0, s0, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 26
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 7
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[8:9], 24
-; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 25
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s6, 8
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 4
-; GFX11-TRUE16-NEXT: s_add_i32 s2, s2, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s17, s17, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 24
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s59, 5
-; GFX11-TRUE16-NEXT: s_lshr_b64 s[58:59], s[10:11], 24
-; GFX11-TRUE16-NEXT: s_add_i32 s16, s16, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 23
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v19, s58, 2
-; GFX11-TRUE16-NEXT: s_add_i32 s19, s19, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s18, s18, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 22
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s9, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s21, s21, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s20, s20, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s23, s23, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 21
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 16
-; GFX11-TRUE16-NEXT: s_add_i32 s22, s22, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s25, s25, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s24, s24, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 20
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s8, 8
-; GFX11-TRUE16-NEXT: s_add_i32 s27, s27, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s26, s26, 3
-; GFX11-TRUE16-NEXT: s_add_i32 s28, s28, 3
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 19
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s100, s15, 24
-; GFX11-TRUE16-NEXT: s_lshr_b32 s101, s40, 8
-; GFX11-TRUE16-NEXT: s_lshr_b32 s102, s29, 16
-; GFX11-TRUE16-NEXT: v_writelane_b32 v18, s42, 18
-; GFX11-TRUE16-NEXT: s_lshr_b32 s42, s11, 16
-; GFX11-TRUE1...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/144819
More information about the llvm-commits
mailing list