[llvm] [AMDGPU] misched: avoid subregister dependencies (PR #140255)
Robert Imschweiler via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 1 02:40:26 PDT 2025
https://github.com/ro-i updated https://github.com/llvm/llvm-project/pull/140255
>From 8cec83b01141254f5e1eac425bc378af57905017 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Thu, 15 May 2025 12:01:36 -0500
Subject: [PATCH 1/4] [AMDGPU] misched: avoid subregister dependencies
Some VOP3P instructions operate on packed 32-bit values and can be
configured (via op_sel/op_sel_hi) to use only one of the two values.
This patch adapts the scheduling dependencies so that, for example, a
write to vgpr3 is not a data dependency for a read from vgpr2_vgpr3
when only vgpr2 is actually used.
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 61 ++++++++
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 11 ++
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 141 +++++++++---------
.../AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir | 138 ++++++++---------
.../CodeGen/AMDGPU/packed-dependencies.mir | 49 ++++++
5 files changed, 261 insertions(+), 139 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 931966b6df1df..9845e77c64a63 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -628,6 +628,62 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}
+bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
+ MachineInstr *UseI,
+ int UseOpIdx) const {
+ if (!InstrInfo.isVOP3P(*UseI))
+ return true;
+ MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
+ if (!DefOp.isReg() || !DefOp.getReg().isPhysical())
+ return true;
+
+ AMDGPU::OpName UseModName;
+ if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(), AMDGPU::OpName::src0) ==
+ UseOpIdx)
+ UseModName = AMDGPU::OpName::src0_modifiers;
+ else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
+ AMDGPU::OpName::src1) == UseOpIdx)
+ UseModName = AMDGPU::OpName::src1_modifiers;
+ else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
+ AMDGPU::OpName::src2) == UseOpIdx)
+ UseModName = AMDGPU::OpName::src2_modifiers;
+ else
+ return true;
+ MachineOperand *UseOpMod = InstrInfo.getNamedOperand(*UseI, UseModName);
+ if (!UseOpMod)
+ return true;
+ // If op_sel and op_sel_hi differ, both halves of the packed register are
+ // read, so the dependency is real and we can return early.
+ auto OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
+ auto OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
+ if ((!OpSel || !OpSelHi) && (OpSel || OpSelHi))
+ return true;
+
+ MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+ if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
+ return true;
+ const SIRegisterInfo *TRI = getRegisterInfo();
+ const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
+ MCRegister DefReg = DefOp.getReg().asMCReg();
+ MCRegister UseReg = UseOp.getReg().asMCReg();
+ // Only handle a 64-bit (packed 2x32-bit) use with a def of at most 32 bits.
+ if (TRI->getRegSizeInBits(UseReg, MRI) != 64 ||
+ TRI->getRegSizeInBits(DefReg, MRI) > 32)
+ return true;
+ SmallVector<MCRegUnit, 2> DefRegUnits(TRI->regunits(DefReg));
+ assert(DefRegUnits.size() <= 2 && "unexpected number of register units");
+ SmallVector<MCRegUnit, 4> UseRegUnits(TRI->regunits(UseReg));
+ assert(UseRegUnits.size() == 4 && "unexpected number of register units");
+
+ auto FindRegunit = [&DefRegUnits](MCRegUnit A, MCRegUnit B) {
+ return llvm::find_if(DefRegUnits, [A, B](MCRegUnit RU) {
+ return RU == A || RU == B;
+ }) != DefRegUnits.end();
+ };
+ return OpSel ? FindRegunit(UseRegUnits[2], UseRegUnits[3])
+ : FindRegunit(UseRegUnits[0], UseRegUnits[1]);
+}
+
void GCNSubtarget::adjustSchedDependency(
SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
const TargetSchedModel *SchedModel) const {
@@ -638,6 +694,11 @@ void GCNSubtarget::adjustSchedDependency(
MachineInstr *DefI = Def->getInstr();
MachineInstr *UseI = Use->getInstr();
+ if (!isRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
+ Dep = SDep(Def, SDep::Artificial);
+ return; // this is not a data dependency anymore
+ }
+
if (DefI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index fb12da513d359..140931e8e91cc 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -293,6 +293,17 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
+ /// From the (MI300) ISA:
+ /// "Packed 32-bit instructions operate on 2 dwords at a time and those
+ /// operands must be two-dword aligned (i.e. an even VGPR address). Output
+ /// modifiers are not supported for these instructions. OPSEL and OPSEL_HI
+ /// work to select the first or second DWORD for each source."
+ /// -> We can save dependencies on VGPRs by analyzing the operand selection.
+ /// See also
+ /// https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
+ bool isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
+ MachineInstr *UseI, int UseOpIdx) const;
+
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
index aad6e031aa9ed..6c4ebef38057b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir
@@ -464,16 +464,10 @@
; GCN-NEXT: buffer_load_dwordx2 v[130:131], v64, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
; GCN-NEXT: v_fma_f32 v48, s4, v48, -v134
- ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
; GCN-NEXT: v_fma_f32 v64, s4, v49, -v134
- ; GCN-NEXT: v_exp_f32_e32 v163, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
; GCN-NEXT: v_fma_f32 v66, s4, v50, -v134
- ; GCN-NEXT: v_exp_f32_e32 v164, v57
; GCN-NEXT: v_exp_f32_e32 v49, v48
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v64
; GCN-NEXT: v_fma_f32 v67, s4, v51, -v134
@@ -495,25 +489,27 @@
; GCN-NEXT: ds_read_b128 v[140:143], v139
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_exp_f32_e32 v54, v48
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v70
; GCN-NEXT: v_exp_f32_e32 v55, v48
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v71
- ; GCN-NEXT: ds_read_b128 v[144:147], v139 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_fma_f32 v66, s4, v56, -v134
; GCN-NEXT: v_exp_f32_e32 v56, v48
; GCN-NEXT: v_sub_f32_e32 v48, v65, v134
+ ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_cvt_f16_f32_e32 v64, v49
; GCN-NEXT: v_cvt_f16_f32_e32 v67, v50
; GCN-NEXT: v_cvt_f16_f32_e32 v68, v51
+ ; GCN-NEXT: v_fma_f32 v96, s4, v58, -v134
; GCN-NEXT: v_cvt_f16_f32_e32 v58, v52
; GCN-NEXT: v_mul_f32_e32 v48, 0x3fb8aa3b, v48
- ; GCN-NEXT: ds_read_b128 v[148:151], v139 offset:1152
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_exp_f32_e32 v48, v48
+ ; GCN-NEXT: v_fma_f32 v57, s4, v57, -v134
; GCN-NEXT: v_pack_b32_f16 v161, v68, v58
; GCN-NEXT: v_pack_b32_f16 v160, v64, v67
; GCN-NEXT: v_mul_f32_e32 v58, 0x3fb8aa3b, v66
@@ -521,9 +517,7 @@
; GCN-NEXT: ds_read_b128 v[152:155], v139 offset:1728
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
- ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
- ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
+ ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
; GCN-NEXT: v_pk_mul_f32 v[64:65], v[64:65], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[66:67], v[66:67], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[68:69], v[68:69], v[48:49] op_sel_hi:[1,0]
@@ -532,10 +526,8 @@
; GCN-NEXT: v_pk_mul_f32 v[74:75], v[74:75], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[76:77], v[76:77], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[78:79], v[78:79], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95
- ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v57
; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0]
@@ -543,10 +535,20 @@
; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[94:95], v[94:95], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79]
; GCN-NEXT: v_exp_f32_e32 v58, v58
- ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_fma_f32 v162, s4, v61, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v61, v55
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
+ ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134
+ ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v163, v57
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v96
+ ; GCN-NEXT: ; implicit-def: $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111
+ ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0]
@@ -554,35 +556,33 @@
; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[110:111], v[110:111], v[48:49] op_sel_hi:[1,0]
- ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
- ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
- ; GCN-NEXT: v_cvt_f16_f32_e32 v140, v53
- ; GCN-NEXT: v_cvt_f16_f32_e32 v141, v54
- ; GCN-NEXT: v_exp_f32_e32 v59, v57
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
- ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134
- ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111]
+ ; GCN-NEXT: v_exp_f32_e32 v164, v57
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v57, v56
; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[122:123], v[122:123], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[124:125], v[124:125], v[48:49] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[126:127], v[126:127], v[48:49] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_pack_b32_f16 v145, v61, v57
+ ; GCN-NEXT: v_mul_f32_e32 v57, 0x3fb8aa3b, v59
; GCN-NEXT: v_fma_f32 v148, s4, v62, -v134
- ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[152:153], v[160:161], v[112:127]
; GCN-NEXT: v_fma_f32 v152, s4, v63, -v134
+ ; GCN-NEXT: v_pack_b32_f16 v144, v140, v141
+ ; GCN-NEXT: v_exp_f32_e32 v59, v57
; GCN-NEXT: v_mul_f32_e32 v149, 0x3fb8aa3b, v60
; GCN-NEXT: ; implicit-def: $vgpr57
; GCN-NEXT: ds_read_b128 v[60:63], v57
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_exp_f32_e32 v160, v149
; GCN-NEXT: v_fma_f32 v161, s4, v33, -v134
; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v148
- ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[142:143], v[144:145], v[64:79]
+ ; GCN-NEXT: v_exp_f32_e32 v160, v149
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v153, v58
; GCN-NEXT: v_fma_f32 v32, s4, v32, -v134
; GCN-NEXT: ds_read_b128 v[140:143], v57 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
@@ -590,22 +590,20 @@
; GCN-NEXT: v_fma_f32 v40, s4, v40, -v134
; GCN-NEXT: v_fma_f32 v44, s4, v44, -v134
; GCN-NEXT: v_fma_f32 v16, s4, v16, -v134
- ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
- ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[146:147], v[144:145], v[80:95]
; GCN-NEXT: v_mul_f32_e32 v146, 0x3fb8aa3b, v162
; GCN-NEXT: v_cvt_f16_f32_e32 v147, v163
; GCN-NEXT: v_exp_f32_e32 v162, v146
; GCN-NEXT: v_cvt_f16_f32_e32 v146, v164
- ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
+ ; GCN-NEXT: v_fma_f32 v166, s4, v20, -v134
; GCN-NEXT: v_pack_b32_f16 v148, v153, v147
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
+ ; GCN-NEXT: v_fma_f32 v24, s4, v24, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[150:151], v[144:145], v[96:111]
; GCN-NEXT: v_exp_f32_e32 v151, v33
; GCN-NEXT: v_cvt_f16_f32_e32 v33, v59
; GCN-NEXT: v_fma_f32 v150, s4, v34, -v134
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
+ ; GCN-NEXT: v_fma_f32 v28, s4, v28, -v134
+ ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v134
; GCN-NEXT: v_pack_b32_f16 v149, v146, v33
; GCN-NEXT: v_mul_f32_e32 v33, 0x3fb8aa3b, v152
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[154:155], v[144:145], v[112:127]
@@ -614,6 +612,8 @@
; GCN-NEXT: v_fma_f32 v155, s4, v36, -v134
; GCN-NEXT: v_perm_b32 v36, v158, v156, s5
; GCN-NEXT: v_cvt_f16_f32_e32 v154, v160
+ ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v134
+ ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[60:61], v[148:149], v[64:79]
; GCN-NEXT: v_mul_f32_e32 v60, 0x3fb8aa3b, v32
; GCN-NEXT: ds_read_b128 v[32:35], v57 offset:1152
@@ -787,12 +787,14 @@
; GCN-NEXT: v_cvt_f16_f32_e32 v45, v158
; GCN-NEXT: v_perm_b32 v21, v148, v144, s5
; GCN-NEXT: v_perm_b32 v37, v148, v144, s8
- ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: ds_write_b64 v135, v[20:21]
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: ds_write_b64 v136, v[36:37]
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[16:17], v[40:41], v[96:111]
; GCN-NEXT: v_perm_b32 v16, v141, v131, s5
; GCN-NEXT: v_fma_f32 v131, s4, v22, -v134
@@ -802,23 +804,19 @@
; GCN-NEXT: v_perm_b32 v17, v149, v145, s5
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v136, v[36:37]
+ ; GCN-NEXT: ds_write_b64 v137, v[16:17]
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[32:33], v[40:41], v[112:127]
; GCN-NEXT: v_pack_b32_f16 v33, v45, v22
; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v60
; GCN-NEXT: v_exp_f32_e32 v144, v22
- ; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: ds_write_b64 v137, v[16:17]
; GCN-NEXT: ; implicit-def: $vgpr17
; GCN-NEXT: ; implicit-def: $vgpr22
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v44, v63
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v138, v[42:43]
; GCN-NEXT: v_add_u32_e32 v22, v132, v22
; GCN-NEXT: v_add_u32_e32 v17, v132, v17
- ; GCN-NEXT: ; implicit-def: $vgpr20
- ; GCN-NEXT: ; implicit-def: $vgpr21
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dwordx2 v[40:41], v22, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -826,9 +824,11 @@
; GCN-NEXT: buffer_load_dwordx2 v[42:43], v17, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: ; implicit-def: $vgpr20
+ ; GCN-NEXT: ; implicit-def: $vgpr21
+ ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
; GCN-NEXT: v_add_u32_e32 v20, v132, v20
; GCN-NEXT: v_add_u32_e32 v21, v132, v21
- ; GCN-NEXT: v_pack_b32_f16 v32, v61, v44
; GCN-NEXT: buffer_load_dwordx2 v[44:45], v20, s[0:3], 0 offen sc0 sc1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
@@ -959,27 +959,27 @@
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v136, v[20:21]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
+ ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
+ ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v137, v[0:1]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b64 v138, v[26:27]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[16:17], v[24:25], v[112:127]
- ; GCN-NEXT: v_pack_b32_f16 v17, v40, v6
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v32
+ ; GCN-NEXT: v_exp_f32_e32 v25, v6
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_pack_b32_f16 v16, v37, v28
; GCN-NEXT: v_fma_f32 v24, s4, v7, -v134
- ; GCN-NEXT: v_exp_f32_e32 v25, v6
+ ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b128 v[4:7], v139
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[22:23], v[16:17], v[64:79]
- ; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v149
; GCN-NEXT: v_exp_f32_e32 v26, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v29
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v150
@@ -998,13 +998,13 @@
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v25
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[18:19], v[16:17], v[112:127]
; GCN-NEXT: v_pack_b32_f16 v17, v2, v0
- ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v24
- ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
+ ; GCN-NEXT: v_pack_b32_f16 v16, v1, v27
; GCN-NEXT: v_exp_f32_e32 v19, v0
; GCN-NEXT: ds_read_b128 v[0:3], v139 offset:1152
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_fma_f32 v18, s4, v11, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[16:17], v[64:79]
; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v8
; GCN-NEXT: ds_read_b128 v[8:11], v139 offset:1728
@@ -1013,41 +1013,41 @@
; GCN-NEXT: v_exp_f32_e32 v24, v4
; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v28
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v26
- ; GCN-NEXT: v_exp_f32_e32 v27, v4
- ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
+ ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[20:21], v[16:17], v[80:95]
+ ; GCN-NEXT: v_exp_f32_e32 v27, v4
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v29
+ ; GCN-NEXT: v_mul_f32_e32 v4, 0x3fb8aa3b, v18
; GCN-NEXT: v_fma_f32 v21, s4, v13, -v134
- ; GCN-NEXT: v_fma_f32 v28, s4, v14, -v134
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[0:1], v[16:17], v[96:111]
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v30
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
; GCN-NEXT: v_exp_f32_e32 v30, v0
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v31
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v19
; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[8:9], v[16:17], v[112:127]
; GCN-NEXT: v_exp_f32_e32 v16, v4
; GCN-NEXT: v_pack_b32_f16 v0, v5, v20
; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v12
- ; GCN-NEXT: v_exp_f32_e32 v18, v9
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
- ; GCN-NEXT: v_exp_f32_e32 v21, v9
; GCN-NEXT: v_fma_f32 v8, s4, v15, -v134
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+ ; GCN-NEXT: v_exp_f32_e32 v18, v9
+ ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v21
; GCN-NEXT: ds_read_b128 v[4:7], v57
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: ds_read_b128 v[12:15], v57 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v17, v24
- ; GCN-NEXT: v_cvt_f16_f32_e32 v20, v27
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[22:23], v[0:1], v[80:95]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
+ ; GCN-NEXT: v_exp_f32_e32 v21, v9
; GCN-NEXT: v_cvt_f16_f32_e32 v23, v18
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v21
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[2:3], v[0:1], v[96:111]
- ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v28
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v3, v30
; GCN-NEXT: v_exp_f32_e32 v2, v2
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[112:127], v[10:11], v[0:1], v[112:127]
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v16
@@ -1102,29 +1102,30 @@
; GCN-NEXT: v_add_f32_e32 v3, v36, v3
; GCN-NEXT: v_add_f32_e32 v3, v39, v3
; GCN-NEXT: v_add_f32_e32 v3, v148, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
; GCN-NEXT: v_add_f32_e32 v3, v34, v3
; GCN-NEXT: v_add_f32_e32 v3, v150, v3
- ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
; GCN-NEXT: v_add_f32_e32 v3, v38, v3
; GCN-NEXT: v_add_f32_e32 v3, v42, v3
; GCN-NEXT: v_add_f32_e32 v3, v25, v3
; GCN-NEXT: v_add_f32_e32 v3, v26, v3
- ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
- ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
; GCN-NEXT: v_add_f32_e32 v3, v29, v3
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
; GCN-NEXT: v_add_f32_e32 v3, v31, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
; GCN-NEXT: v_add_f32_e32 v3, v19, v3
; GCN-NEXT: v_add_f32_e32 v3, v24, v3
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v1, v10
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v11, v2
; GCN-NEXT: v_add_f32_e32 v3, v27, v3
; GCN-NEXT: v_add_f32_e32 v3, v30, v3
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[12:13], v[8:9], v[80:95]
; GCN-NEXT: v_add_f32_e32 v3, v16, v3
; GCN-NEXT: v_add_f32_e32 v3, v18, v3
+ ; GCN-NEXT: v_pack_b32_f16 v1, v11, v1
+ ; GCN-NEXT: v_pack_b32_f16 v0, v23, v22
; GCN-NEXT: v_add_f32_e32 v3, v21, v3
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[4:5], v[8:9], v[64:79]
+ ; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[6:7], v[0:1], v[64:79]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[14:15], v[0:1], v[80:95]
; GCN-NEXT: v_add_f32_e32 v0, v2, v3
; GCN-NEXT: v_add_f32_e32 v4, v10, v0
; GCN-NEXT: ds_bpermute_b32 v5, v133, v4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
index 0887fdf0844b0..4e16ff82f1e60 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.small.mir
@@ -218,54 +218,48 @@
; GCN-NEXT: v_max_f32_e32 v70, v70, v70
; GCN-NEXT: v_max_f32_e32 v72, v81, v70
; GCN-NEXT: v_fma_f32 v16, s4, v16, -v72
+ ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72
; GCN-NEXT: v_fma_f32 v18, s4, v18, -v72
; GCN-NEXT: v_fma_f32 v19, s4, v19, -v72
; GCN-NEXT: v_mul_f32_e32 v16, 0x3fb8aa3b, v16
+ ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v18
; GCN-NEXT: v_mul_f32_e32 v19, 0x3fb8aa3b, v19
- ; GCN-NEXT: v_fma_f32 v17, s4, v17, -v72
; GCN-NEXT: v_fma_f32 v20, s4, v20, -v72
; GCN-NEXT: v_fma_f32 v21, s4, v21, -v72
; GCN-NEXT: v_fma_f32 v22, s4, v22, -v72
; GCN-NEXT: v_fma_f32 v23, s4, v23, -v72
; GCN-NEXT: v_exp_f32_e32 v73, v16
+ ; GCN-NEXT: v_exp_f32_e32 v17, v17
; GCN-NEXT: v_exp_f32_e32 v74, v18
; GCN-NEXT: v_exp_f32_e32 v75, v19
; GCN-NEXT: v_mul_f32_e32 v20, 0x3fb8aa3b, v20
; GCN-NEXT: v_mul_f32_e32 v21, 0x3fb8aa3b, v21
; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v22
- ; GCN-NEXT: v_exp_f32_e32 v80, v20
; GCN-NEXT: v_cvt_f16_f32_e32 v16, v73
; GCN-NEXT: v_fma_f32 v18, s4, v24, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v80, v20
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17
+ ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72
; GCN-NEXT: v_exp_f32_e32 v81, v21
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v74
- ; GCN-NEXT: v_fma_f32 v20, s4, v25, -v72
+ ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72
; GCN-NEXT: v_exp_f32_e32 v82, v22
; GCN-NEXT: v_cvt_f16_f32_e32 v22, v75
- ; GCN-NEXT: v_mul_f32_e32 v17, 0x3fb8aa3b, v17
; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
- ; GCN-NEXT: v_fma_f32 v26, s4, v26, -v72
- ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22
- ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18
; GCN-NEXT: v_sub_f32_e32 v24, v67, v72
- ; GCN-NEXT: v_exp_f32_e32 v83, v23
+ ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19
+ ; GCN-NEXT: v_pack_b32_f16 v71, v21, v22
; GCN-NEXT: v_fma_f32 v67, s4, v27, -v72
- ; GCN-NEXT: v_exp_f32_e32 v85, v22
- ; GCN-NEXT: v_exp_f32_e32 v17, v17
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
+ ; GCN-NEXT: v_exp_f32_e32 v83, v23
+ ; GCN-NEXT: v_mul_f32_e32 v22, 0x3fb8aa3b, v18
; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v20
- ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v17
- ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72
- ; GCN-NEXT: v_exp_f32_e32 v88, v23
- ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72
- ; GCN-NEXT: v_pack_b32_f16 v70, v16, v19
; GCN-NEXT: ds_read_b128 v[18:21], v84
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
; GCN-NEXT: v_exp_f32_e32 v16, v24
- ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576
- ; GCN-NEXT: s_waitcnt lgkmcnt(0)
- ; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2
; GCN-NEXT: v_pk_mul_f32 v[48:49], v[48:49], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[50:51], v[50:51], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[52:53], v[52:53], v[16:17] op_sel_hi:[1,0]
@@ -276,30 +270,35 @@
; GCN-NEXT: v_pk_mul_f32 v[62:63], v[62:63], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[32:33], v[32:33], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[34:35], v[34:35], v[16:17] op_sel_hi:[1,0]
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
+ ; GCN-NEXT: v_add_f32_e32 v18, 0, v73
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80
+ ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v85, v22
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81
+ ; GCN-NEXT: v_fma_f32 v87, s4, v29, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v88, v23
+ ; GCN-NEXT: ds_read_b128 v[22:25], v84 offset:576
+ ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+ ; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_pk_mul_f32 v[36:37], v[36:37], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[38:39], v[38:39], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[40:41], v[40:41], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[42:43], v[42:43], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[44:45], v[44:45], v[16:17] op_sel_hi:[1,0]
; GCN-NEXT: v_pk_mul_f32 v[46:47], v[46:47], v[16:17] op_sel_hi:[1,0]
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[18:19], v[70:71], v[48:63]
- ; GCN-NEXT: v_add_f32_e32 v18, 0, v73
- ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83
- ; GCN-NEXT: v_fma_f32 v73, s4, v28, -v72
- ; GCN-NEXT: v_cvt_f16_f32_e32 v19, v80
- ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72
- ; GCN-NEXT: v_perm_b32 v90, v69, v65, s2
+ ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3
+ ; GCN-NEXT: s_nop 0
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[22:23], v[70:71], v[32:47]
; GCN-NEXT: v_add_f32_e32 v17, v17, v18
; GCN-NEXT: v_mul_f32_e32 v18, 0x3fb8aa3b, v26
- ; GCN-NEXT: v_cvt_f16_f32_e32 v86, v81
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82
; GCN-NEXT: v_fma_f32 v23, s4, v30, -v72
; GCN-NEXT: v_exp_f32_e32 v30, v18
- ; GCN-NEXT: v_cvt_f16_f32_e32 v22, v82
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v89, v83
; GCN-NEXT: v_fma_f32 v18, s4, v31, -v72
; GCN-NEXT: v_perm_b32 v31, v68, v64, s2
; GCN-NEXT: v_perm_b32 v64, v68, v64, s3
- ; GCN-NEXT: v_perm_b32 v65, v69, v65, s3
; GCN-NEXT: ds_read_b128 v[26:29], v91
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
@@ -322,31 +321,37 @@
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b32 v78, v90
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
; GCN-NEXT: buffer_wbl2 sc0 sc1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_write_b32 v79, v65
; GCN-NEXT: v_mul_f32_e32 v64, 0x3fb8aa3b, v73
; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v87
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[20:21], v[18:19], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
; GCN-NEXT: v_add_f32_e32 v17, v74, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v20, v85
- ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72
+ ; GCN-NEXT: v_fma_f32 v0, s4, v0, -v72
; GCN-NEXT: v_exp_f32_e32 v22, v64
; GCN-NEXT: v_cvt_f16_f32_e32 v21, v88
+ ; GCN-NEXT: v_fma_f32 v1, s4, v1, -v72
; GCN-NEXT: v_exp_f32_e32 v64, v65
- ; GCN-NEXT: v_mul_f32_e32 v23, 0x3fb8aa3b, v23
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[24:25], v[18:19], v[32:47]
; GCN-NEXT: v_add_f32_e32 v17, v75, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v30
- ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72
+ ; GCN-NEXT: v_fma_f32 v2, s4, v2, -v72
; GCN-NEXT: v_exp_f32_e32 v23, v23
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v31
+ ; GCN-NEXT: v_fma_f32 v24, s4, v3, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v25, v67
; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v0
; GCN-NEXT: v_mul_f32_e32 v65, 0x3fb8aa3b, v1
; GCN-NEXT: v_pack_b32_f16 v0, v20, v21
; GCN-NEXT: v_pack_b32_f16 v1, v18, v19
- ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72
- ; GCN-NEXT: v_exp_f32_e32 v25, v67
+ ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
+ ; GCN-NEXT: ;;#ASMSTART
+ ; GCN-NEXT: s_waitcnt vmcnt(8)
+ ; GCN-NEXT: ;;#ASMEND
+ ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[26:27], v[0:1], v[48:63]
; GCN-NEXT: v_add_f32_e32 v17, v80, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v18, v22
@@ -356,62 +361,76 @@
; GCN-NEXT: v_fma_f32 v67, s4, v5, -v72
; GCN-NEXT: v_exp_f32_e32 v65, v65
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[68:69], v[0:1], v[32:47]
- ; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8aa3b, v2
; GCN-NEXT: v_add_f32_e32 v17, v81, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v23
- ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72
+ ; GCN-NEXT: v_fma_f32 v6, s4, v6, -v72
; GCN-NEXT: v_exp_f32_e32 v68, v2
; GCN-NEXT: v_cvt_f16_f32_e32 v19, v25
- ; GCN-NEXT: ;;#ASMSTART
- ; GCN-NEXT: s_waitcnt vmcnt(8)
- ; GCN-NEXT: ;;#ASMEND
- ; GCN-NEXT: v_mul_f32_e32 v24, 0x3fb8aa3b, v24
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: ds_read_b128 v[0:3], v84
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
; GCN-NEXT: v_pack_b32_f16 v4, v18, v4
+ ; GCN-NEXT: v_fma_f32 v7, s4, v7, -v72
; GCN-NEXT: v_pack_b32_f16 v5, v5, v19
; GCN-NEXT: v_exp_f32_e32 v24, v24
; GCN-NEXT: ds_read_b128 v[18:21], v84 offset:576
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
; GCN-NEXT: v_mul_f32_e32 v26, 0x3fb8aa3b, v26
; GCN-NEXT: v_mul_f32_e32 v67, 0x3fb8aa3b, v67
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[28:29], v[4:5], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6
; GCN-NEXT: v_add_f32_e32 v17, v82, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v27
+ ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72
; GCN-NEXT: v_exp_f32_e32 v26, v26
; GCN-NEXT: v_cvt_f16_f32_e32 v29, v65
- ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72
+ ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72
; GCN-NEXT: v_exp_f32_e32 v67, v67
- ; GCN-NEXT: v_mul_f32_e32 v6, 0x3fb8aa3b, v6
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[70:71], v[4:5], v[32:47]
; GCN-NEXT: v_add_f32_e32 v17, v83, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v5, v68
+ ; GCN-NEXT: v_fma_f32 v10, s4, v10, -v72
; GCN-NEXT: v_exp_f32_e32 v6, v6
; GCN-NEXT: v_cvt_f16_f32_e32 v69, v24
; GCN-NEXT: v_mul_f32_e32 v7, 0x3fb8aa3b, v7
+ ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72
; GCN-NEXT: v_exp_f32_e32 v7, v7
; GCN-NEXT: v_pack_b32_f16 v4, v28, v29
; GCN-NEXT: v_pack_b32_f16 v5, v5, v69
- ; GCN-NEXT: ; implicit-def: $sgpr2
- ; GCN-NEXT: s_nop 1
+ ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8
+ ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[4:5], v[48:63]
; GCN-NEXT: v_add_f32_e32 v0, v85, v17
; GCN-NEXT: v_cvt_f16_f32_e32 v17, v26
+ ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v8, v8
; GCN-NEXT: v_cvt_f16_f32_e32 v28, v67
+ ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v9, v9
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[18:19], v[4:5], v[32:47]
; GCN-NEXT: v_add_f32_e32 v4, v88, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v10
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v6
+ ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72
; GCN-NEXT: v_exp_f32_e32 v10, v0
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v7
+ ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11
+ ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72
+ ; GCN-NEXT: v_exp_f32_e32 v11, v11
; GCN-NEXT: v_pack_b32_f16 v1, v1, v0
; GCN-NEXT: v_pack_b32_f16 v0, v17, v28
+ ; GCN-NEXT: ; implicit-def: $sgpr2
; GCN-NEXT: s_nop 1
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
+ ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12
; GCN-NEXT: v_add_f32_e32 v2, v30, v4
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8
+ ; GCN-NEXT: v_exp_f32_e32 v12, v3
+ ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9
+ ; GCN-NEXT: v_exp_f32_e32 v15, v3
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[20:21], v[0:1], v[32:47]
; GCN-NEXT: v_add_f32_e32 v0, v31, v2
; GCN-NEXT: v_add_f32_e32 v0, v22, v0
@@ -419,46 +438,27 @@
; GCN-NEXT: v_add_f32_e32 v0, v23, v0
; GCN-NEXT: v_add_f32_e32 v0, v25, v0
; GCN-NEXT: v_add_f32_e32 v0, v27, v0
- ; GCN-NEXT: v_fma_f32 v8, s4, v8, -v72
; GCN-NEXT: v_add_f32_e32 v0, v65, v0
- ; GCN-NEXT: v_fma_f32 v9, s4, v9, -v72
- ; GCN-NEXT: v_mul_f32_e32 v8, 0x3fb8aa3b, v8
; GCN-NEXT: v_add_f32_e32 v0, v68, v0
- ; GCN-NEXT: v_fma_f32 v11, s4, v11, -v72
- ; GCN-NEXT: v_mul_f32_e32 v9, 0x3fb8aa3b, v9
- ; GCN-NEXT: v_fma_f32 v12, s4, v12, -v72
- ; GCN-NEXT: v_fma_f32 v13, s4, v13, -v72
- ; GCN-NEXT: v_exp_f32_e32 v8, v8
; GCN-NEXT: v_add_f32_e32 v0, v24, v0
- ; GCN-NEXT: v_fma_f32 v5, s4, v14, -v72
- ; GCN-NEXT: v_exp_f32_e32 v9, v9
; GCN-NEXT: v_add_f32_e32 v0, v26, v0
; GCN-NEXT: v_add_f32_e32 v0, v67, v0
- ; GCN-NEXT: v_fma_f32 v14, s4, v15, -v72
- ; GCN-NEXT: v_mul_f32_e32 v11, 0x3fb8aa3b, v11
- ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v12
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v5
; GCN-NEXT: v_add_f32_e32 v0, v6, v0
- ; GCN-NEXT: v_exp_f32_e32 v11, v11
- ; GCN-NEXT: v_cvt_f16_f32_e32 v4, v8
- ; GCN-NEXT: v_exp_f32_e32 v12, v3
- ; GCN-NEXT: v_mul_f32_e32 v3, 0x3fb8aa3b, v13
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10
; GCN-NEXT: v_exp_f32_e32 v17, v1
; GCN-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v14
; GCN-NEXT: v_add_f32_e32 v0, v7, v0
- ; GCN-NEXT: v_cvt_f16_f32_e32 v13, v9
- ; GCN-NEXT: v_exp_f32_e32 v15, v3
+ ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11
; GCN-NEXT: v_exp_f32_e32 v18, v1
; GCN-NEXT: v_add_f32_e32 v6, v8, v0
; GCN-NEXT: ds_read_b128 v[0:3], v91
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_inv sc0 sc1
- ; GCN-NEXT: v_cvt_f16_f32_e32 v5, v10
- ; GCN-NEXT: v_cvt_f16_f32_e32 v14, v11
; GCN-NEXT: v_add_f32_e32 v6, v9, v6
+ ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14
; GCN-NEXT: v_pack_b32_f16 v8, v4, v13
; GCN-NEXT: v_add_f32_e32 v6, v10, v6
- ; GCN-NEXT: v_pack_b32_f16 v9, v5, v14
; GCN-NEXT: v_cvt_f16_f32_e32 v7, v18
; GCN-NEXT: v_cvt_f16_f32_e32 v10, v15
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[0:1], v[8:9], v[48:63]
@@ -478,13 +478,13 @@
; GCN-NEXT: s_waitcnt vmcnt(8)
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_mov_b32_e32 v4, 0
- ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[48:63], v[2:3], v[0:1], v[48:63]
; GCN-NEXT: v_add_f32_e32 v2, v18, v11
; GCN-NEXT: ds_bpermute_b32 v3, v66, v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_add_f32_e32 v2, v2, v3
; GCN-NEXT: ds_bpermute_b32 v3, v66, v2
+ ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[32:47], v[6:7], v[0:1], v[32:47]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1]
; GCN-NEXT: v_fmac_f32_e32 v2, v4, v16
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
new file mode 100644
index 0000000000000..8d8f31c69b64a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -0,0 +1,49 @@
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+--- |
+ define amdgpu_kernel void @smallInterleave() { ret void }
+ ; GCN-LABEL: SU(3): renamable $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 5
+ ; GCN-NEXT: # succs left : 2
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 1
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(2): Data Latency=0 Reg=$vgpr2_vgpr3
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: Successors:
+ ; GCN-NEXT: SU(5): Data Latency=1 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: SU(4): Anti Latency=0
+
+ ; GCN-LABEL: smallInterleave:
+ ; GCN: ; %bb.0:
+ ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+ ; GCN-NEXT: v_mov_b32_e32 v0, 0
+ ; GCN-NEXT: v_mov_b32_e32 v1, 1
+ ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3
+ ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
+ ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3
+ ; GCN-NEXT: buffer_wbl2 sc0 sc1
+ ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] sc0 sc1
+ ; GCN-NEXT: s_endpgm
+...
+
+---
+name: smallInterleave
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ undef %0.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
+ undef %0.sub1:vreg_64_align2 = V_MOV_B32_e32 1, implicit $exec
+ %2:vreg_64_align2 = IMPLICIT_DEF
+ %3:vreg_64_align2 = contract nofpexcept V_PK_MUL_F32 8, %2:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ %4:vreg_64_align2 = IMPLICIT_DEF
+ FLAT_STORE_DWORDX2 undef %4:vreg_64_align2, %3:vreg_64_align2, 0, 0, implicit $exec, implicit $flat_scr
+ S_ENDPGM 0
+...
>From dde69bbf051977591fbb6736526a070c1e5a3e03 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Fri, 16 May 2025 14:27:34 -0500
Subject: [PATCH 2/4] implement feedback part 1
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 19 ++++--
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 11 +---
.../CodeGen/AMDGPU/packed-dependencies.mir | 58 +++++++------------
3 files changed, 38 insertions(+), 50 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 9845e77c64a63..3061a5cf704fa 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -631,11 +631,23 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
MachineInstr *UseI,
int UseOpIdx) const {
+ // From the (gfx942, for example) ISA:
+ // "Packed 32-bit instructions operate on 2 dwords at a time and those
+ // operands must be two-dword aligned (i.e. an even VGPR address). Output
+ // modifiers are not supported for these instructions. OPSEL and OPSEL_HI work
+ // to select the first or second DWORD for each source."
+ // -> We can save dependencies on VGPRs by analyzing the operand selection.
+ // See also
+ // https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
+
if (!InstrInfo.isVOP3P(*UseI))
return true;
MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
if (!DefOp.isReg() || !DefOp.getReg().isPhysical())
return true;
+ MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+ if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
+ return true;
AMDGPU::OpName UseModName;
if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(), AMDGPU::OpName::src0) ==
@@ -654,14 +666,11 @@ bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
return true;
// Check whether all parts of the register are being used (= op_sel and
// op_sel_hi differ). In that case we can return early.
- auto OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
- auto OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
+ int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
+ int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
if ((!OpSel || !OpSelHi) && (OpSel || OpSelHi))
return true;
- MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
- if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
- return true;
const SIRegisterInfo *TRI = getRegisterInfo();
const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
MCRegister DefReg = DefOp.getReg().asMCReg();
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 140931e8e91cc..d0444485def0c 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -293,14 +293,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
- /// From the (MI300) ISA:
- /// "Packed 32-bit instructions operate on 2 dwords at a time and those
- /// operands must be two-dword aligned (i.e. an even VGPR address). Output
- /// modifiers are not supported for these instructions. OPSEL and OPSEL_HI
- /// work to select the first or second DWORD for each source."
- /// -> We can save dependencies on VGPRs by analyzing the operand selection.
- /// See also
- /// https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
+ /// Check whether there is a real dependency between the definition and the
+ /// use. The definition might only affect a subregister that is not actually
+ /// used.
bool isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
MachineInstr *UseI, int UseOpIdx) const;
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
index 8d8f31c69b64a..f2b088b8c108d 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -1,49 +1,33 @@
-# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -o - %s 2>&1 | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -stop-after=machine-scheduler -filetype=null %s 2>&1 | FileCheck -check-prefix=GCN %s
--- |
- define amdgpu_kernel void @smallInterleave() { ret void }
- ; GCN-LABEL: SU(3): renamable $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: # preds left : 5
- ; GCN-NEXT: # succs left : 2
- ; GCN-NEXT: # rdefs left : 0
- ; GCN-NEXT: Latency : 1
- ; GCN-NEXT: Depth : 1
- ; GCN-NEXT: Height : 1
- ; GCN-NEXT: Predecessors:
- ; GCN-NEXT: SU(2): Data Latency=0 Reg=$vgpr2_vgpr3
- ; GCN-NEXT: SU(1): Out Latency=1
- ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
- ; GCN-NEXT: SU(0): Out Latency=1
- ; GCN-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
- ; GCN-NEXT: Successors:
- ; GCN-NEXT: SU(5): Data Latency=1 Reg=$vgpr0_vgpr1
- ; GCN-NEXT: SU(4): Anti Latency=0
-
- ; GCN-LABEL: smallInterleave:
- ; GCN: ; %bb.0:
- ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
- ; GCN-NEXT: v_mov_b32_e32 v0, 0
- ; GCN-NEXT: v_mov_b32_e32 v1, 1
- ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3
- ; GCN-NEXT: v_pk_mul_f32 v[0:1], v[2:3], v[0:1] op_sel_hi:[1,0]
- ; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3
- ; GCN-NEXT: buffer_wbl2 sc0 sc1
- ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] sc0 sc1
- ; GCN-NEXT: s_endpgm
+ define amdgpu_kernel void @check_subreg_dep() { ret void }
+ ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 1
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 1
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: Successors:
+ ; GCN-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
...
---
-name: smallInterleave
+name: check_subreg_dep
tracksRegLiveness: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
- undef %0.sub0:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec
- undef %0.sub1:vreg_64_align2 = V_MOV_B32_e32 1, implicit $exec
- %2:vreg_64_align2 = IMPLICIT_DEF
- %3:vreg_64_align2 = contract nofpexcept V_PK_MUL_F32 8, %2:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- %4:vreg_64_align2 = IMPLICIT_DEF
- FLAT_STORE_DWORDX2 undef %4:vreg_64_align2, %3:vreg_64_align2, 0, 0, implicit $exec, implicit $flat_scr
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr1 = V_MOV_B32_e32 1, implicit $exec
+ $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ FLAT_STORE_DWORDX2 undef $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
S_ENDPGM 0
...
>From d981ba6c0ab278dfa98a05391f80765a323779a1 Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Mon, 11 Aug 2025 09:14:21 -0500
Subject: [PATCH 3/4] implement feedback; improve testing
---
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 182 +++-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 14 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.h | 20 +
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +-
.../CodeGen/AMDGPU/packed-dependencies.mir | 966 +++++++++++++++++-
5 files changed, 1112 insertions(+), 74 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 3061a5cf704fa..7c9ecaf9e2ef7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -628,69 +628,136 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
return std::pair(MaxNumVGPRs, MaxNumAGPRs);
}
-bool GCNSubtarget::isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
- MachineInstr *UseI,
- int UseOpIdx) const {
- // From the (gfx942, for example) ISA:
- // "Packed 32-bit instructions operate on 2 dwords at a time and those
- // operands must be two-dword aligned (i.e. an even VGPR address). Output
- // modifiers are not supported for these instructions. OPSEL and OPSEL_HI work
- // to select the first or second DWORD for each source."
- // -> We can save dependencies on VGPRs by analyzing the operand selection.
- // See also
- // https://llvm.org/docs/AMDGPUModifierSyntax.html#amdgpu-synid-op-sel
-
- if (!InstrInfo.isVOP3P(*UseI))
- return true;
- MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
- if (!DefOp.isReg() || !DefOp.getReg().isPhysical())
- return true;
- MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
- if (!UseOp.isReg() || !UseOp.getReg().isPhysical())
- return true;
-
+// Check which source operand UseOpIdx points to and return a pointer to the
+// operand of the corresponding source modifier.
+// Return nullptr if UseOpIdx either doesn't point to src0/1/2 or if there is no
+// operand for the corresponding source modifier.
+static MachineOperand *
+getVOP3PSourceModifierFromOpIdx(MachineInstr *UseI, int UseOpIdx,
+ const SIInstrInfo &InstrInfo) {
AMDGPU::OpName UseModName;
- if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(), AMDGPU::OpName::src0) ==
- UseOpIdx)
+ unsigned UseOpcode = UseI->getOpcode();
+ if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src0) == UseOpIdx)
UseModName = AMDGPU::OpName::src0_modifiers;
- else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
- AMDGPU::OpName::src1) == UseOpIdx)
+ else if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src1) ==
+ UseOpIdx)
UseModName = AMDGPU::OpName::src1_modifiers;
- else if (AMDGPU::getNamedOperandIdx(UseI->getOpcode(),
- AMDGPU::OpName::src2) == UseOpIdx)
+ else if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src2) ==
+ UseOpIdx)
UseModName = AMDGPU::OpName::src2_modifiers;
else
- return true;
- MachineOperand *UseOpMod = InstrInfo.getNamedOperand(*UseI, UseModName);
+ return nullptr;
+ return InstrInfo.getNamedOperand(*UseI, UseModName);
+}
+
+// Get the subreg idx of the subreg that is used by the given VOP3P instruction
+// operand, considering the given op_sel and op_sel_hi modifiers.
+static unsigned getUsedVOP3PSubRegIdx(const SIRegisterInfo *TRI,
+ const MachineRegisterInfo &MRI,
+ const SIInstrInfo &InstrInfo,
+ const MachineOperand &Op, int64_t OpSel,
+ int64_t OpSelHi) {
+ unsigned RegSize;
+
+ if (InstrInfo.isVOP3PMix(*Op.getParent()))
+ RegSize = OpSelHi ? 32 : 64;
+ else if (unsigned SubRegIdx = Op.getSubReg())
+ RegSize = TRI->getSubRegIdxSize(SubRegIdx);
+ else
+ RegSize = TRI->getRegSizeInBits(Op.getReg(), MRI);
+
+ assert((RegSize == 64 || RegSize == 32) && "unexpected VOP3P operand size");
+
+ switch (RegSize) {
+ case 32:
+ return OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
+ case 64:
+ return OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
+ default:
+ llvm::reportFatalInternalError("currently unsupported VOP3P operand size");
+ }
+}
+
+std::pair<bool, std::optional<Register>>
+GCNSubtarget::getRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
+ MachineInstr *UseI, int UseOpIdx) const {
+ if (!InstrInfo.isVOP3P(*UseI) || InstrInfo.isWMMA(*UseI) ||
+ InstrInfo.isSWMMAC(*UseI))
+ return {true, std::nullopt};
+
+ MachineOperand *UseOpMod =
+ getVOP3PSourceModifierFromOpIdx(UseI, UseOpIdx, InstrInfo);
if (!UseOpMod)
- return true;
- // Check whether all parts of the register are being used (= op_sel and
- // op_sel_hi differ). In that case we can return early.
- int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
- int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
- if ((!OpSel || !OpSelHi) && (OpSel || OpSelHi))
- return true;
+ return {true, std::nullopt};
+
+ MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
+ MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+ if (!UseOp.isReg())
+ return {true, std::nullopt};
+ Register DefReg = DefOp.getReg();
+ Register UseReg = UseOp.getReg();
+
+ bool IsVirtual = DefReg.isVirtual() && UseReg.isVirtual();
+ assert((IsVirtual || (DefReg.isPhysical() && UseReg.isPhysical())) &&
+ "register virtual/physical mismatch");
const SIRegisterInfo *TRI = getRegisterInfo();
const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
- MCRegister DefReg = DefOp.getReg().asMCReg();
- MCRegister UseReg = UseOp.getReg().asMCReg();
- // We specifically look for a packed 32bit Use and smaller Def.
- if (TRI->getRegSizeInBits(UseReg, MRI) != 64 ||
- TRI->getRegSizeInBits(DefReg, MRI) > 32)
- return true;
- SmallVector<MCRegUnit, 2> DefRegUnits(TRI->regunits(DefReg));
- assert(DefRegUnits.size() <= 2 && "unexpected number of register units");
- SmallVector<MCRegUnit, 4> UseRegUnits(TRI->regunits(UseReg));
- assert(UseRegUnits.size() == 4 && "unexpected number of register units");
-
- auto FindRegunit = [&DefRegUnits](MCRegUnit A, MCRegUnit B) {
- return llvm::find_if(DefRegUnits, [A, B](MCRegUnit RU) {
- return RU == A || RU == B;
- }) != DefRegUnits.end();
- };
- return OpSel ? FindRegunit(UseRegUnits[2], UseRegUnits[3])
- : FindRegunit(UseRegUnits[0], UseRegUnits[1]);
+
+ // Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
+ // the op_sel and op_sel_hi source modifiers:
+ // - op_sel: selects low/high operand bits as input to the operation;
+ // only has meaning for 16-bit source operands
+ // - op_sel_hi: specifies the size of the source operands (16 or 32 bits);
+ // a value of 0 indicates 32 bit, 1 indicates 16 bit
+ // For the other VOP3P instructions, the semantics are:
+ // - op_sel: selects low/high operand bits as input to the operation which
+ // results in the lower-half of the destination
+ // - op_sel_hi: selects the low/high operand bits as input to the operation
+ // which results in the higher-half of the destination
+ int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
+ int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
+ // First, check if all parts of the register are being used (= op_sel and
+ // op_sel_hi differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we
+ // can return early.
+ if ((!InstrInfo.isVOP3PMix(*UseI) && (!OpSel || !OpSelHi) &&
+ (OpSel || OpSelHi)) ||
+ (InstrInfo.isVOP3PMix(*UseI) && !OpSelHi)) {
+ // An optimization we can still make: we can restrict the dependency to the
+ // smaller register. At least when we're dealing with physical registers.
+ // For virtual registers, we currently have to stick to the SSA value
+ // itself because we cannot construct a subreg for a virtual register.
+ bool IsDefSmaller = !IsVirtual && TRI->getRegSizeInBits(DefReg, MRI) <=
+ TRI->getRegSizeInBits(UseReg, MRI);
+ return {true, IsDefSmaller ? DefReg : UseReg};
+ }
+ // Otherwise, we now know that only one of two parts is being used. This
+ // allows us to return the subreg that is actually being used.
+ unsigned UseSubRegIdx =
+ getUsedVOP3PSubRegIdx(TRI, MRI, InstrInfo, UseOp, OpSel, OpSelHi);
+
+ bool IsRealDep;
+ if (IsVirtual) {
+ // If the definition isn't restricted to a sub-register, there is no point
+ // in further analysis. This check only makes sense for virtual registers
+ // because physical registers may form a tuple and thus be part of a
+ // superregister although they are not a subregister themselves (vgpr0 is a
+ // "subreg" of vgpr0_vgpr1 without being a subreg in itself).
+ unsigned DefSubRegIdx = DefOp.getSubReg();
+ if (!DefSubRegIdx)
+ return {true, std::nullopt};
+ // It is a real dependency only if the def's lanes overlap the used lanes.
+ LaneBitmask DefLaneMask = TRI->getSubRegIndexLaneMask(DefSubRegIdx);
+ LaneBitmask UseLaneMask = TRI->getSubRegIndexLaneMask(UseSubRegIdx);
+ IsRealDep = (DefLaneMask & UseLaneMask).any();
+ } else {
+ assert(DefReg.isPhysical() && UseReg.isPhysical());
+
+ MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
+ IsRealDep = TRI->regsOverlap(UseMCReg, DefReg);
+ }
+
+ return {IsRealDep, IsRealDep ? std::optional(DefReg) : std::nullopt};
}
void GCNSubtarget::adjustSchedDependency(
@@ -703,7 +770,12 @@ void GCNSubtarget::adjustSchedDependency(
MachineInstr *DefI = Def->getInstr();
MachineInstr *UseI = Use->getInstr();
- if (!isRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
+ if (const auto &[IsRealDep, Reg] =
+ getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx);
+ IsRealDep) {
+ if (Reg)
+ Dep.setReg(*Reg);
+ } else {
Dep = SDep(Def, SDep::Artificial);
return; // this is not a data dependency anymore
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d0444485def0c..ecd81e1db0777 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -294,10 +294,16 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
SIFrameLowering FrameLowering;
/// Check whether there is a real dependency between the definition and the
- /// use. The definition might only affect a subregister that is not actually
- /// used.
- bool isRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
- MachineInstr *UseI, int UseOpIdx) const;
+ /// use. The definition might only affect a subregister that is not actually
+ /// used. Works for both virtual and physical registers.
+ /// The bool part of the returned pair tells if we're dealing with a real
+ /// dependency.
+ /// The optional Register part of the returned pair holds the (sub)register
+ /// that is the actual dependency in cases we can determine that.
+ /// Note: WMMA and SWMMAC instructions are currently not supported.
+ std::pair<bool, std::optional<Register>>
+ getRealSchedDependency(MachineInstr *DefI, int DefOpIdx, MachineInstr *UseI,
+ int UseOpIdx) const;
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index fdbd9ce4a66bf..3103f9b30b8c2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -833,6 +833,26 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
}
+ bool isVOP3PMix(const MachineInstr &MI) const {
+ return isVOP3PMix(MI.getOpcode());
+ }
+
+ bool isVOP3PMix(uint16_t Opcode) const {
+ if (!isVOP3P(Opcode))
+ return false;
+ switch (Opcode) {
+ case AMDGPU::V_FMA_MIXHI_F16:
+ case AMDGPU::V_FMA_MIXLO_F16:
+ case AMDGPU::V_FMA_MIX_F32:
+ case AMDGPU::V_MAD_MIXHI_F16:
+ case AMDGPU::V_MAD_MIXLO_F16:
+ case AMDGPU::V_MAD_MIX_F32:
+ return true;
+ default:
+ return false;
+ }
+ }
+
static bool isVINTRP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ce280d484da1b..fd4b8070a0184 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -327,8 +327,8 @@ let SubtargetPredicate = HasMadMixInsts in {
let OtherPredicates = [NoFP32Denormals] in {
// These are VOP3a-like opcodes which accept no omod.
-// Size of src arguments (16/32) is controlled by op_sel.
-// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
+// Size of src arguments (16/32) is controlled by op_sel_hi.
+// For 16-bit src arguments their location (hi/lo) is controlled by op_sel.
let isCommutable = 1, mayRaiseFPException = 0 in {
let isReMaterializable = 1 in
defm V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3P_Mix_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
diff --git a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
index f2b088b8c108d..11ce1b0047a68 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
+++ b/llvm/test/CodeGen/AMDGPU/packed-dependencies.mir
@@ -1,33 +1,973 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx942 -start-before=machine-scheduler -verify-misched -misched-print-dags -stop-after=machine-scheduler -filetype=null %s 2>&1 | FileCheck -check-prefix=GCN %s
---- |
- define amdgpu_kernel void @check_subreg_dep() { ret void }
- ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+# Note: the source modifier is the parameter before the source itself. So,
+# src0_modifiers is the parameter in the list before src0, src1_modifiers before
+# src1.
+# For the srcN_modifiers, the following values are relevant for these tests:
+# - 0: op_sel=0 and op_sel_hi=0
+# - 4: op_sel=1 and op_sel_hi=0
+# - 8: op_sel=0 and op_sel_hi=1
+# - 12: op_sel=1 and op_sel_hi=1
+# For every test where we test two register arguments, the size of the arguments
+# and the used parts are encoded in the test name. Examples:
+# - *_32_lo_lo_32_lo_lo: two args of size 32 where only the low parts are used
+# - *_16_lo_hi_16_hi_hi: two args of size 16 where both parts of the first arg
+# and the high part of the second arg are used
+# For a "(lo|hi)_(lo|hi)" pair, the first field denotes the part controlled by
+# op_sel, the second field the one controlled by op_sel_hi.
+#
+# For the mad_mix_* tests, op_sel and op_sel_hi have slightly different semantics:
+# - op_sel_hi: selects if the full 32bit of the arg should be used or only a
+# 16bit part (which is then selected by op_sel)
+# op_sel_hi=0 selects 32bit
+# op_sel_hi=1 selects 16bit
+# - op_sel: selects low/high part of arg
+# So, for the srcN_modifiers, we have the following values:
+# - 0: op_sel=0 and 32bit (op_sel_hi=0)
+# - 4: op_sel=1 and 32bit (op_sel_hi=0)
+# - 8: op_sel=0 and 16bit (op_sel_hi=1)
+# - 12: op_sel=1 and 16bit (op_sel_hi=1)
+
+---
+name: pk_mul_virtual_32_lo_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_32_lo_lo_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_16_lo_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_lo_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
; GCN-NEXT: # preds left : 4
- ; GCN-NEXT: # succs left : 1
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_32_lo_lo_32_lo_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 3
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 0, %0:vreg_64_align2, 8, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_32_lo_lo_32_hi_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 5
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 0, $vgpr0_vgpr1, 4, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_16_lo_lo_16_lo_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 3
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_lo_lo_16_hi_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 5
+ ; GCN-NEXT: # succs left : 0
; GCN-NEXT: # rdefs left : 0
; GCN-NEXT: Latency : 1
; GCN-NEXT: Depth : 1
- ; GCN-NEXT: Height : 1
+ ; GCN-NEXT: Height : 0
; GCN-NEXT: Predecessors:
; GCN-NEXT: SU(1): Out Latency=1
; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
; GCN-NEXT: SU(0): Out Latency=1
- ; GCN-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
- ; GCN-NEXT: Successors:
- ; GCN-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 4, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
...
---
-name: check_subreg_dep
+name: pk_mul_virtual_32_hi_lo_32_lo_lo
tracksRegLiveness: true
machineFunctionInfo:
stackPtrOffsetReg: '$sgpr32'
body: |
bb.0:
- $vgpr0 = V_MOV_B32_e32 0, implicit $exec
- $vgpr1 = V_MOV_B32_e32 1, implicit $exec
- $vgpr0_vgpr1 = contract nofpexcept V_PK_MUL_F32 8, undef $vgpr2_vgpr3, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
- FLAT_STORE_DWORDX2 undef $vgpr2_vgpr3, $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+ ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 3
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 4, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_32_lo_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 5
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 8, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_16_hi_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 3
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_PK_MUL_F16 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_hi_lo_16_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 5
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 4, $vgpr0, 0, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_32_hi_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 0, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_32_hi_hi_32_lo_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 6
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 0, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_16_hi_hi_16_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_hi_hi_16_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr0, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_32_hi_hi_32_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.sub0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1:vreg_64_align2 = IMPLICIT_DEF
+ %1:vreg_64_align2 = nofpexcept V_PK_MUL_F32 12, %0:vreg_64_align2, 12, %0:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_32_hi_hi_32_hi_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr1
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr0_vgpr1 = nofpexcept V_PK_MUL_F32 12, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_16_lo_lo_16_lo_lo_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1_hi16:vreg_64_align2 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_PK_MUL_F16 0, %0.sub0:vreg_64_align2, 0, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_lo_lo_16_lo_lo_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 2
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 0, $vgpr0, 0, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_virtual_16_hi_hi_16_hi_hi_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ %0:vreg_64_align2 = IMPLICIT_DEF
+ %0.sub1_lo16:vreg_64_align2 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_PK_MUL_F16 12, %0.sub0:vreg_64_align2, 12, %0.sub1:vreg_64_align2, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 2
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: pk_mul_physical_16_hi_hi_16_hi_hi_superreg_definition1
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_vgpr1 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr1_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 3
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_vgpr1
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_vgpr1 = IMPLICIT_DEF
+ $vgpr1_lo16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_PK_MUL_F16 12, $vgpr0, 12, $vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_virtual_16_lo_16_lo_16_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 8, %0:vgpr_32, 8, %0:vgpr_32, 8, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_physical_16_lo_16_lo_16_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_MAD_MIX_F32 8, $vgpr0, 8, $vgpr0, 8, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_virtual_16_hi_16_hi_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 12, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_physical_16_hi_16_hi_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 12, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_virtual_16_hi_16_lo_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 12, %0:vgpr_32, 8, %0:vgpr_32, 12, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_physical_16_hi_16_lo_16_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 6
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(1): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Ord Latency=0 Artificial
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_MAD_MIX_F32 12, $vgpr0, 8, $vgpr0, 12, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_virtual_32_hi_32_hi_32_hi
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 0
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ undef %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 4, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_physical_32_lo_32_lo_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(0): Out Latency=1
+ ; GCN-NEXT: SU(0): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 0, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_virtual_32_hi_32_hi_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): %0:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): %0.lo16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): %0.hi16:vgpr_32 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(3): dead %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 2
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 1
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(2): Data Latency=0 Reg=%0
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=%0
+ ; GCN-NEXT: Pressure Diff
+ ;
+ %0:vgpr_32 = IMPLICIT_DEF
+ %0.lo16:vgpr_32 = IMPLICIT_DEF
+ %0.hi16:vgpr_32 = IMPLICIT_DEF
+ %1:vgpr_32 = nofpexcept V_MAD_MIX_F32 4, %0:vgpr_32, 4, %0:vgpr_32, 0, %0:vgpr_32, 0, 0, 0, implicit $mode, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: mad_mix_physical_32_lo_32_hi_32_lo
+tracksRegLiveness: true
+machineFunctionInfo:
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ bb.0:
+ ; GCN-LABEL: SU(0): $vgpr0 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(1): $vgpr0_lo16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(2): $vgpr0_hi16 = IMPLICIT_DEF
+ ; GCN-LABEL: SU(3): $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
+ ; GCN-NEXT: # preds left : 4
+ ; GCN-NEXT: # succs left : 0
+ ; GCN-NEXT: # rdefs left : 0
+ ; GCN-NEXT: Latency : 1
+ ; GCN-NEXT: Depth : 2
+ ; GCN-NEXT: Height : 0
+ ; GCN-NEXT: Predecessors:
+ ; GCN-NEXT: SU(2): Out Latency=1
+ ; GCN-NEXT: SU(2): Data Latency=0 Reg=$vgpr0_hi16
+ ; GCN-NEXT: SU(1): Out Latency=1
+ ; GCN-NEXT: SU(1): Data Latency=0 Reg=$vgpr0_lo16
+ ; GCN-NEXT: Pressure Diff
+ ;
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr0_lo16 = IMPLICIT_DEF
+ $vgpr0_hi16 = IMPLICIT_DEF
+ $vgpr0 = nofpexcept V_MAD_MIX_F32 0, $vgpr0, 4, $vgpr0, 0, $vgpr0, 0, 0, 0, implicit $mode, implicit $exec
S_ENDPGM 0
...
>From 512c730578cb6ee57d6f8ace72920a259394ea3e Mon Sep 17 00:00:00 2001
From: Robert Imschweiler <robert.imschweiler at amd.com>
Date: Mon, 1 Sep 2025 04:35:50 -0500
Subject: [PATCH 4/4] implement feedback
---
.../include/llvm/CodeGen/TargetRegisterInfo.h | 22 ++
llvm/lib/MC/MCRegisterInfo.cpp | 5 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 191 ++++++++----------
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 19 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 45 ++---
llvm/test/CodeGen/AMDGPU/fmed3.ll | 6 +-
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll | 2 +-
.../AMDGPU/sched-image-sample-post-RA.mir | 2 +-
.../CodeGen/AMDGPU/schedule-physregdeps.mir | 8 +-
llvm/test/CodeGen/AMDGPU/scratch-simple.ll | 102 +++++-----
10 files changed, 201 insertions(+), 201 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
index 12ee51f130262..eae0e66c23500 100644
--- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -467,6 +467,28 @@ class LLVM_ABI TargetRegisterInfo : public MCRegisterInfo {
return false;
}
+ /// Returns true if the two subregisters are equal or overlap.
+ /// Handles virtual and physical registers; a mixed pair never overlaps.
+ bool subRegsOverlap(Register RegA, unsigned SubA, Register RegB,
+ unsigned SubB) const {
+ if (RegA == RegB && SubA == SubB)
+ return true;
+ if (RegA.isVirtual() && RegB.isVirtual()) {
+ if (RegA != RegB)
+ return false;
+ LaneBitmask LA = getSubRegIndexLaneMask(SubA);
+ LaneBitmask LB = getSubRegIndexLaneMask(SubB);
+ return (LA & LB).any();
+ }
+ if (RegA.isPhysical() && RegB.isPhysical()) {
+ RegA = getSubReg(RegA.asMCReg(), SubA);
+ RegB = getSubReg(RegB.asMCReg(), SubB);
+ assert(RegB.isValid() && RegA.isValid() && "invalid subregister");
+ return MCRegisterInfo::regsOverlap(RegA.asMCReg(), RegB.asMCReg());
+ }
+ return false;
+ }
+
/// Returns true if Reg contains RegUnit.
bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const {
return llvm::is_contained(regunits(Reg), RegUnit);
diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp
index ba9ef00f9f0d8..c76aed2adda8e 100644
--- a/llvm/lib/MC/MCRegisterInfo.cpp
+++ b/llvm/lib/MC/MCRegisterInfo.cpp
@@ -114,8 +114,9 @@ MCRegisterInfo::getMatchingSuperReg(MCRegister Reg, unsigned SubIdx,
}
MCRegister MCRegisterInfo::getSubReg(MCRegister Reg, unsigned Idx) const {
- assert(Idx && Idx < getNumSubRegIndices() &&
- "This is not a subregister index");
+ if (!Idx)
+ return Reg;
+ assert(Idx < getNumSubRegIndices() && "This is not a subregister index");
// Get a pointer to the corresponding SubRegIndices list. This list has the
// name of each sub-register in the same order as MCSubRegIterator.
const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 7c9ecaf9e2ef7..7aad3b131a4e4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -632,77 +632,42 @@ GCNSubtarget::getMaxNumVectorRegs(const Function &F) const {
// operand of the corresponding source modifier.
// Return nullptr if UseOpIdx either doesn't point to src0/1/2 or if there is no
// operand for the corresponding source modifier.
-static MachineOperand *
-getVOP3PSourceModifierFromOpIdx(MachineInstr *UseI, int UseOpIdx,
+static const MachineOperand *
+getVOP3PSourceModifierFromOpIdx(const MachineInstr *UseI, int UseOpIdx,
const SIInstrInfo &InstrInfo) {
AMDGPU::OpName UseModName;
- unsigned UseOpcode = UseI->getOpcode();
- if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src0) == UseOpIdx)
+ AMDGPU::OpName UseName =
+ AMDGPU::getOperandIdxName(UseI->getOpcode(), UseOpIdx);
+ switch (UseName) {
+ case AMDGPU::OpName::src0:
UseModName = AMDGPU::OpName::src0_modifiers;
- else if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src1) ==
- UseOpIdx)
+ break;
+ case AMDGPU::OpName::src1:
UseModName = AMDGPU::OpName::src1_modifiers;
- else if (AMDGPU::getNamedOperandIdx(UseOpcode, AMDGPU::OpName::src2) ==
- UseOpIdx)
+ break;
+ case AMDGPU::OpName::src2:
UseModName = AMDGPU::OpName::src2_modifiers;
- else
+ break;
+ default:
return nullptr;
+ }
return InstrInfo.getNamedOperand(*UseI, UseModName);
}
-// Get the subreg idx of the subreg that is used by the given VOP3P instruction
-// operand, considering the given op_sel and op_sel_hi modifiers.
-static unsigned getUsedVOP3PSubRegIdx(const SIRegisterInfo *TRI,
- const MachineRegisterInfo &MRI,
+// Get the subreg idx of the subreg that is used by the given instruction
+// operand, considering the given op_sel modifier.
+// Return 0 if the whole register is used or as a conservative fallback.
+static unsigned getEffectiveSubRegIdx(const SIRegisterInfo *TRI,
const SIInstrInfo &InstrInfo,
- const MachineOperand &Op, int64_t OpSel,
- int64_t OpSelHi) {
- unsigned RegSize;
-
- if (InstrInfo.isVOP3PMix(*Op.getParent()))
- RegSize = OpSelHi ? 32 : 64;
- else if (unsigned SubRegIdx = Op.getSubReg())
- RegSize = TRI->getSubRegIdxSize(SubRegIdx);
- else
- RegSize = TRI->getRegSizeInBits(Op.getReg(), MRI);
-
- assert((RegSize == 64 || RegSize == 32) && "unexpected VOP3P operand size");
-
- switch (RegSize) {
- case 32:
- return OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
- case 64:
- return OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
- default:
- llvm::reportFatalInternalError("currently unsupported VOP3P operand size");
- }
-}
-
-std::pair<bool, std::optional<Register>>
-GCNSubtarget::getRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
- MachineInstr *UseI, int UseOpIdx) const {
- if (!InstrInfo.isVOP3P(*UseI) || InstrInfo.isWMMA(*UseI) ||
- InstrInfo.isSWMMAC(*UseI))
- return {true, std::nullopt};
+ const MachineOperand &Op) {
+ const MachineInstr *I = Op.getParent();
+ if (!InstrInfo.isVOP3P(*I) || InstrInfo.isWMMA(*I) || InstrInfo.isSWMMAC(*I))
+ return 0;
- MachineOperand *UseOpMod =
- getVOP3PSourceModifierFromOpIdx(UseI, UseOpIdx, InstrInfo);
- if (!UseOpMod)
- return {true, std::nullopt};
-
- MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
- MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
- if (!UseOp.isReg())
- return {true, std::nullopt};
- Register DefReg = DefOp.getReg();
- Register UseReg = UseOp.getReg();
-
- bool IsVirtual = DefReg.isVirtual() && UseReg.isVirtual();
- assert((IsVirtual || (DefReg.isPhysical() && UseReg.isPhysical())) &&
- "register virtual/physical mismatch");
-
- const SIRegisterInfo *TRI = getRegisterInfo();
- const MachineRegisterInfo &MRI = UseI->getParent()->getParent()->getRegInfo();
+ const MachineOperand *OpMod =
+ getVOP3PSourceModifierFromOpIdx(I, Op.getOperandNo(), InstrInfo);
+ if (!OpMod)
+ return 0;
// Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
// the op_sel and op_sel_hi source modifiers:
@@ -715,49 +680,68 @@ GCNSubtarget::getRealSchedDependency(MachineInstr *DefI, int DefOpIdx,
// results in the lower-half of the destination
// - op_sel_hi: selects the low/high operand bits as input to the operation
// which results in the higher-half of the destination
- int64_t OpSel = UseOpMod->getImm() & SISrcMods::OP_SEL_0;
- int64_t OpSelHi = UseOpMod->getImm() & SISrcMods::OP_SEL_1;
- // First, check if all parts of the register are being used (= op_sel and
- // op_sel_hi differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we
- // can return early.
- if ((!InstrInfo.isVOP3PMix(*UseI) && (!OpSel || !OpSelHi) &&
+ int64_t OpSel = OpMod->getImm() & SISrcMods::OP_SEL_0;
+ int64_t OpSelHi = OpMod->getImm() & SISrcMods::OP_SEL_1;
+
+ // Check if all parts of the register are being used (= op_sel and op_sel_hi
+ // differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we can return
+ // early.
+ if ((!InstrInfo.isVOP3PMix(*I) && (!OpSel || !OpSelHi) &&
(OpSel || OpSelHi)) ||
- (InstrInfo.isVOP3PMix(*UseI) && !OpSelHi)) {
- // An optimization we can still make: we can restrict the dependency to the
- // smaller register. At least when we're dealing with physical registers.
- // For virtual registers, we currently have to stick to the SSA value
- // itself because we cannot construct a subreg for a virtual register.
- bool IsDefSmaller = !IsVirtual && TRI->getRegSizeInBits(DefReg, MRI) <=
- TRI->getRegSizeInBits(UseReg, MRI);
- return {true, IsDefSmaller ? DefReg : UseReg};
- }
- // Otherwise, we now know that only one of two parts is being used. This
- // allows us to return the subreg that is actually being used.
- unsigned UseSubRegIdx =
- getUsedVOP3PSubRegIdx(TRI, MRI, InstrInfo, UseOp, OpSel, OpSelHi);
-
- bool IsRealDep;
- if (IsVirtual) {
- // If the definition isn't restricted to a sub-register, there is no point
- // in further analysis. This check makes only sense for virtual registers
- // because physical registers may form a tuple and thus be part of a
- // superregister although they are not a subregister themselves (vgpr0 is a
- // "subreg" of vgpr0_vgpr1 without being a subreg in itself).
- unsigned DefSubRegIdx = DefOp.getSubReg();
- if (!DefSubRegIdx)
- return {true, std::nullopt};
- // Get the subreg idx of the selected part of the use.
- LaneBitmask DefLaneMask = TRI->getSubRegIndexLaneMask(DefSubRegIdx);
- LaneBitmask UseLaneMask = TRI->getSubRegIndexLaneMask(UseSubRegIdx);
- IsRealDep = (DefLaneMask & UseLaneMask).any();
- } else {
- assert(DefReg.isPhysical() && UseReg.isPhysical());
+ (InstrInfo.isVOP3PMix(*I) && !OpSelHi))
+ return 0;
- MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
- IsRealDep = TRI->regsOverlap(UseMCReg, DefReg);
- }
+ const TargetRegisterClass *RC =
+ InstrInfo.getOpRegClass(*I, Op.getOperandNo());
+
+ if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
+ TRI->getSubRegisterClass(RC, SubRegIdx))
+ return SubRegIdx;
+ if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
+ TRI->getSubRegisterClass(RC, SubRegIdx))
+ return SubRegIdx;
+
+ return 0;
+}
+
+Register GCNSubtarget::getRealSchedDependency(const MachineInstr *DefI,
+ int DefOpIdx,
+ const MachineInstr *UseI,
+ int UseOpIdx) const {
+ const SIRegisterInfo *TRI = getRegisterInfo();
+ const MachineOperand &DefOp = DefI->getOperand(DefOpIdx);
+ const MachineOperand &UseOp = UseI->getOperand(UseOpIdx);
+ Register DefReg = DefOp.getReg();
+ Register UseReg = UseOp.getReg();
- return {IsRealDep, IsRealDep ? std::optional(DefReg) : std::nullopt};
+ // If the registers aren't restricted to a sub-register, there is no point in
+ // further analysis. This check only makes sense for virtual registers because
+ // physical registers may form a tuple and thus be part of a superregister
+ // although they are not a subregister themselves (vgpr0 is a "subreg" of
+ // vgpr0_vgpr1 without being a subreg in itself).
+ unsigned DefSubRegIdx = DefOp.getSubReg();
+ if (DefReg.isVirtual() && !DefSubRegIdx)
+ return DefReg;
+ unsigned UseSubRegIdx = getEffectiveSubRegIdx(TRI, InstrInfo, UseOp);
+ if (UseReg.isVirtual() && !UseSubRegIdx)
+ return DefReg;
+
+ if (!TRI->subRegsOverlap(DefReg, DefSubRegIdx, UseReg, UseSubRegIdx))
+ return 0; // no real dependency
+
+ // UseReg might be smaller or larger than DefReg, depending on the subreg and
+ // on whether DefReg is a subreg, too. -> Find the smaller one. This does not
+ // apply to virtual registers because we cannot construct a subreg for them.
+ if (DefReg.isVirtual())
+ return DefReg;
+ MCRegister DefMCReg = TRI->getSubReg(DefReg.asMCReg(), DefSubRegIdx);
+ MCRegister UseMCReg = TRI->getSubReg(UseReg.asMCReg(), UseSubRegIdx);
+ const TargetRegisterClass *DefRC = TRI->getPhysRegBaseClass(DefMCReg);
+ const TargetRegisterClass *UseRC = TRI->getPhysRegBaseClass(UseMCReg);
+ // Some registers, such as SGPR[0-9]+_HI16, do not have a register class.
+ if (!DefRC || !UseRC)
+ return DefReg;
+ return DefRC->hasSubClass(UseRC) ? UseMCReg : DefMCReg;
}
void GCNSubtarget::adjustSchedDependency(
@@ -770,11 +754,8 @@ void GCNSubtarget::adjustSchedDependency(
MachineInstr *DefI = Def->getInstr();
MachineInstr *UseI = Use->getInstr();
- if (const auto &[IsRealDep, Reg] =
- getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx);
- IsRealDep) {
- if (Reg)
- Dep.setReg(*Reg);
+ if (Register Reg = getRealSchedDependency(DefI, DefOpIdx, UseI, UseOpIdx)) {
+ Dep.setReg(Reg);
} else {
Dep = SDep(Def, SDep::Artificial);
return; // this is not a data dependency anymore
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ecd81e1db0777..6187d9aef3ea7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -293,17 +293,14 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;
- /// Check whether there is a real dependency between the definition and the
- /// use. The definition might only affect a subregister that is not actually
- /// used. Works for both virtual and physical registers.
- /// The bool part of the returned pair tells if we're dealing with a real
- /// dependency.
- /// The optional Register part of the returned pair holds the (sub)register
- /// that is the actual dependency in cases we can determine that.
- /// Note: WMMA an SWMMAC instructions are currently not supported.
- std::pair<bool, std::optional<Register>>
- getRealSchedDependency(MachineInstr *DefI, int DefOpIdx, MachineInstr *UseI,
- int UseOpIdx) const;
+ /// Get the register that represents the actual dependency between the
+ /// definition and the use. The definition might only affect a subregister
+ /// that is not actually used. Works for both virtual and physical registers.
+ /// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
+ /// Returns 0 (no register) if there is no real dependency; otherwise the
+ /// definition register unless a better match is found.
+ Register getRealSchedDependency(const MachineInstr *DefI, int DefOpIdx,
+ const MachineInstr *UseI, int UseOpIdx) const;
public:
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index ddd3b1520bf5e..e600114235c2b 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -965,11 +965,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
;
; GFX11-LABEL: ps_mesa_inreg_v5i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: s_add_i32 s3, s3, 4
-; GFX11-NEXT: s_add_i32 s2, s2, 3
; GFX11-NEXT: s_add_i32 s1, s1, 2
; GFX11-NEXT: s_add_i32 s4, s4, 5
; GFX11-NEXT: s_add_i32 s0, s0, 1
+; GFX11-NEXT: s_add_i32 s3, s3, 4
+; GFX11-NEXT: s_add_i32 s2, s2, 3
; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
; GFX11-NEXT: v_mov_b32_e32 v2, s2
@@ -980,12 +980,11 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
;
; GFX1250-LABEL: ps_mesa_inreg_v5i32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_add_co_i32 s3, s3, 4
-; GFX1250-NEXT: s_add_co_i32 s2, s2, 3
; GFX1250-NEXT: s_add_co_i32 s1, s1, 2
; GFX1250-NEXT: s_add_co_i32 s4, s4, 5
; GFX1250-NEXT: s_add_co_i32 s0, s0, 1
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_add_co_i32 s3, s3, 4
+; GFX1250-NEXT: s_add_co_i32 s2, s2, 3
; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GFX1250-NEXT: v_mov_b32_e32 v3, s3
@@ -1014,22 +1013,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
;
; VI-LABEL: ps_mesa_inreg_v5f32:
; VI: ; %bb.0:
-; VI-NEXT: v_add_f32_e64 v3, s3, -1.0
-; VI-NEXT: v_add_f32_e64 v2, s2, 4.0
; VI-NEXT: v_add_f32_e64 v1, s1, 2.0
; VI-NEXT: v_add_f32_e64 v0, s0, 1.0
; VI-NEXT: v_add_f32_e64 v4, s4, 0.5
+; VI-NEXT: v_add_f32_e64 v3, s3, -1.0
+; VI-NEXT: v_add_f32_e64 v2, s2, 4.0
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ps_mesa_inreg_v5f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0
-; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0
; GFX11-NEXT: v_add_f32_e64 v1, s1, 2.0
; GFX11-NEXT: v_add_f32_e64 v4, s4, 0.5
; GFX11-NEXT: v_add_f32_e64 v0, s0, 1.0
+; GFX11-NEXT: v_add_f32_e64 v3, s3, -1.0
+; GFX11-NEXT: v_add_f32_e64 v2, s2, 4.0
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
@@ -1037,13 +1036,13 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
;
; GFX1250-LABEL: ps_mesa_inreg_v5f32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: s_add_f32 s3, s3, -1.0
; GFX1250-NEXT: s_add_f32 s4, s4, 0.5
; GFX1250-NEXT: s_add_f32 s0, s0, 1.0
; GFX1250-NEXT: s_add_f32 s1, s1, 2.0
+; GFX1250-NEXT: s_add_f32 s3, s3, -1.0
; GFX1250-NEXT: s_add_f32 s2, s2, 4.0
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2)
; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
; GFX1250-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
; GFX1250-NEXT: v_mov_b32_e32 v3, s3
; GFX1250-NEXT: s_clause 0x1
@@ -1148,22 +1147,22 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
;
; VI-LABEL: ps_mesa_v5i32:
; VI: ; %bb.0:
-; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3
-; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: v_add_u32_e32 v1, vcc, 2, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT: v_add_u32_e32 v4, vcc, 5, v4
+; VI-NEXT: v_add_u32_e32 v3, vcc, 4, v3
+; VI-NEXT: v_add_u32_e32 v2, vcc, 3, v2
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ps_mesa_v5i32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3
-; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v1
; GFX11-NEXT: v_add_nc_u32_e32 v4, 5, v4
; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 3, v2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
@@ -1171,9 +1170,9 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
;
; GFX1250-LABEL: ps_mesa_v5i32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2
; GFX1250-NEXT: v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4
-; GFX1250-NEXT: v_add_nc_u32_e32 v0, 1, v0
+; GFX1250-NEXT: v_dual_add_nc_u32 v0, 1, v0 :: v_dual_add_nc_u32 v3, 4, v3
+; GFX1250-NEXT: v_add_nc_u32_e32 v2, 3, v2
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
@@ -1199,20 +1198,20 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
;
; VI-LABEL: ps_mesa_v5f32:
; VI: ; %bb.0:
-; VI-NEXT: v_add_f32_e32 v3, -1.0, v3
-; VI-NEXT: v_add_f32_e32 v2, 4.0, v2
; VI-NEXT: v_add_f32_e32 v1, 2.0, v1
; VI-NEXT: v_add_f32_e32 v0, 1.0, v0
; VI-NEXT: v_add_f32_e32 v4, 0.5, v4
+; VI-NEXT: v_add_f32_e32 v3, -1.0, v3
+; VI-NEXT: v_add_f32_e32 v2, 4.0, v2
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_dwordx4 v[0:1], v[0:3]
; VI-NEXT: s_endpgm
;
; GFX11-LABEL: ps_mesa_v5f32:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
; GFX11-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
-; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3
+; GFX11-NEXT: v_add_f32_e32 v2, 4.0, v2
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b32 v[0:1], v4, off
; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off
@@ -1220,9 +1219,9 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
;
; GFX1250-LABEL: ps_mesa_v5f32:
; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
; GFX1250-NEXT: v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
-; GFX1250-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT: v_dual_add_f32 v0, 1.0, v0 :: v_dual_add_f32 v3, -1.0, v3
+; GFX1250-NEXT: v_add_f32_e32 v2, 4.0, v2
; GFX1250-NEXT: s_clause 0x1
; GFX1250-NEXT: global_store_b32 v[0:1], v4, off
; GFX1250-NEXT: global_store_b128 v[0:1], v[0:3], off
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f8059a202..2c44dd1fc3b2e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8104,8 +8104,8 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
; GFX9-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_f32_e32 v3, 0.5, v1
-; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX9-NEXT: v_med3_f32 v2, v3, 1.0, v2
+; GFX9-NEXT: v_add_f32_e32 v1, 0x41800000, v1
; GFX9-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-NEXT: global_store_dword v[0:1], v1, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
@@ -8260,9 +8260,9 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
; GFX9-SDAG-NEXT: s_mov_b32 s2, 0x41000000
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX9-SDAG-NEXT: v_add_f32_e32 v3, 0.5, v1
+; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2
; GFX9-SDAG-NEXT: v_add_f32_e32 v4, 0x41800000, v1
; GFX9-SDAG-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-SDAG-NEXT: v_med3_f32 v2, v3, s2, v2
; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-SDAG-NEXT: global_store_dword v[0:1], v4, off
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -8280,9 +8280,9 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_add_f32_e32 v4, 0.5, v1
+; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
; GFX9-GISEL-NEXT: v_add_f32_e32 v5, 0x41800000, v1
; GFX9-GISEL-NEXT: v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-GISEL-NEXT: v_med3_f32 v2, v4, v2, v3
; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1]
; GFX9-GISEL-NEXT: global_store_dword v[0:1], v5, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 5b2213592f495..ff118ab33617a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -6669,8 +6669,8 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: s_bfe_u32 s4, s2, 0x10004
; GFX12-NEXT: s_bfe_u32 s3, s2, 0x10009
-; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001
; GFX12-NEXT: v_lshrrev_b32_e32 v10, 15, v4
+; GFX12-NEXT: s_bfe_u32 s5, s2, 0x10001
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
; GFX12-NEXT: s_wait_alu 0xfffe
; GFX12-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir b/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir
index a2a0794ac59f3..aeb54bc080d58 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-image-sample-post-RA.mir
@@ -94,10 +94,10 @@ body: |
; BOTTOMUP-NEXT: renamable $vgpr11 = IMAGE_SAMPLE_V1_V2_gfx11 $vgpr9_vgpr10, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
; BOTTOMUP-NEXT: renamable $vgpr5_vgpr6_vgpr7_vgpr8 = IMAGE_SAMPLE_V4_V2_gfx11 killed $vgpr9_vgpr10, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39, 15, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), addrspace 8)
; BOTTOMUP-NEXT: }
- ; BOTTOMUP-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec
; BOTTOMUP-NEXT: nofpexcept V_CMP_GT_F32_e32 1065353216, killed $vgpr11, implicit-def $vcc_lo, implicit $mode, implicit $exec
; BOTTOMUP-NEXT: renamable $sgpr0_sgpr1 = COPY $vcc
; BOTTOMUP-NEXT: nofpexcept V_CMP_GT_F32_e32 1065353216, killed $vgpr8, implicit-def $vcc_lo, implicit $mode, implicit $exec
+ ; BOTTOMUP-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec
; BOTTOMUP-NEXT: renamable $sgpr2_sgpr3 = S_AND_B64 killed renamable $sgpr0_sgpr1, killed renamable $vcc, implicit-def dead $scc
; BOTTOMUP-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec
; BOTTOMUP-NEXT: renamable $vgpr12 = V_MOV_B32_e32 0, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
index 77e67b2732481..27908957b5886 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
@@ -15,7 +15,7 @@
# CHECK-NEXT: SU(0): Data Latency=1 Reg=$vgpr0
# CHECK: Successors:
# CHECK-NEXT: SU(4): Out Latency=1
-# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0
# CHECK-NEXT: SU(3): Out Latency=1
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vcc
# CHECK: SU(3): $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def dead $vcc, implicit $vcc, implicit $exec
@@ -26,13 +26,13 @@
# CHECK-NEXT: SU(1): Data Latency=1 Reg=$vgpr1
# CHECK: Successors:
# CHECK-NEXT: SU(4): Out Latency=1
-# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr1
# CHECK: SU(4): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
# CHECK: Predecessors:
# CHECK-NEXT: SU(3): Out Latency=1
-# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr1
# CHECK-NEXT: SU(2): Out Latency=1
-# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0_vgpr1
+# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0
# CHECK: Successors:
# CHECK-NEXT: ExitSU: Ord Latency=3 Artificial
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index 7a3bff8aed56e..ead040fd14280 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -701,15 +701,15 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -722,10 +722,10 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -891,15 +891,15 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -912,10 +912,10 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -995,8 +995,8 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
@@ -1724,15 +1724,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -1745,10 +1745,10 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -1914,15 +1914,15 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -1935,10 +1935,10 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -2018,8 +2018,8 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
@@ -2747,15 +2747,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -2768,10 +2768,10 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -2937,15 +2937,15 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -2958,10 +2958,10 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -3041,8 +3041,8 @@ define amdgpu_cs float @cs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
@@ -3767,15 +3767,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -3788,10 +3788,10 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -3957,15 +3957,15 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -3978,10 +3978,10 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -4061,8 +4061,8 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
@@ -4787,15 +4787,15 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -4808,10 +4808,10 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -4977,15 +4977,15 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -4998,10 +4998,10 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -5081,8 +5081,8 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
@@ -5817,15 +5817,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -5838,10 +5838,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -6009,15 +6009,15 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -6030,10 +6030,10 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -6113,9 +6113,9 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
@@ -6848,15 +6848,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -6869,10 +6869,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-NEXT: v_mov_b32_e32 v29, v12
@@ -7040,15 +7040,15 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v11, 0xbefcd89f
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0xbefcd8a3
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v10, v9
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v20, 0xbf5f2ee3
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[5:8], off offset:304
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[1:4], off offset:288
; GFX10-FLATSCR-PAL-NEXT: scratch_store_dwordx4 off, v[9:12], off offset:272
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v12, 0x3eae29dc
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v14, 0x3e319356
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v16, 0xbf3d349e
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v18, 0x3efcd89f
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v23, 0xbf523be3
; GFX10-FLATSCR-PAL-NEXT: v_add_nc_u32_e32 v39, 0x200, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v35, v0
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v0, 0xb702e758
@@ -7061,10 +7061,10 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v19, 0x3efcd89c
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v21, 0xbf638e39
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v22, v20
+; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v24, 0x3f20e7f5
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v25, v16
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v26, v23
-; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v4, 0x3f20e7f4
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v27, 0x3703c499
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v28, v14
; GFX10-FLATSCR-PAL-NEXT: v_mov_b32_e32 v29, v12
@@ -7144,9 +7144,9 @@ define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg,
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v14, 0x3eae29d8
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v16, 0x3e31934f
; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v22, 0xbf638e39
+; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v20, 0x3efcd89c
; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v25, 0x3f20e7f5 :: v_dual_mov_b32 v26, v17
-; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v20, 0x3efcd89c :: v_dual_mov_b32 v29, v15
-; GFX11-FLATSCR-NEXT: v_mov_b32_e32 v30, v13
+; GFX11-FLATSCR-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v30, v13
; GFX11-FLATSCR-NEXT: s_clause 0x1
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[0:3], off offset:272
; GFX11-FLATSCR-NEXT: scratch_store_b128 off, v[9:12], off offset:256
More information about the llvm-commits
mailing list