[llvm] AMDGPU/GlobalISel: Uniformity info based regbankselect (PR #73684)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 28 10:37:10 PST 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Petar Avramovic (petar-avramovic)
<details>
<summary>Changes</summary>
RFC for an approach to rewriting reg-bank selection using machine uniformity info analysis.
Implemented only for one opcode in this patch.
Current proposal is to pre-select register banks on dst registers using MUI before regbankselection starts.
Then, using the pre-assigned dst reg banks instead of looking into the register banks of the inputs, select register banks for instructions so that available machine instructions can be instruction-selected.
---
Patch is 241.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/73684.diff
14 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp (+21-1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+165-37)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h (+26-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll (+48-26)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/floor.f64.ll (+16)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll (+11-6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fadd.mir (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll (+328-310)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll (+142-124)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp.ll (+227-172)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.exp2.ll (+101-65)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log.ll (+279-161)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.log10.ll (+279-161)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
index 2ea03ddb1fccd6c..ca9c70e7ed4a017 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -15,7 +15,10 @@
#include "AMDGPURegBankSelect.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "regbankselect"
@@ -68,7 +71,24 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
MachineUniformityInfo Uniformity =
computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
!ST.isSingleLaneExecution(F));
- (void)Uniformity; // TODO: Use this
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_FADD: {
+ Register Dst = MI.getOperand(0).getReg();
+ if (Uniformity.isUniform(Dst)) {
+ MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::SGPRRegBankID));
+ } else {
+ MRI->setRegBank(Dst, RBI->getRegBank(AMDGPU::VGPRRegBankID));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ }
+ }
assignRegisterBanks(MF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 62996a3b3fb79fb..b8d6a3bd3c0e6fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -695,58 +695,108 @@ static LLT getHalfSizedType(LLT Ty) {
// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
// source value into a scalar register.
-Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
- MachineRegisterInfo &MRI,
- Register Src) const {
+Register AMDGPURegisterBankInfo::buildReadFirstLaneSrc(MachineIRBuilder &B,
+ Register Src) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(Src);
const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
- if (Bank == &AMDGPU::SGPRRegBank)
- return Src;
-
- unsigned Bits = Ty.getSizeInBits();
- assert(Bits % 32 == 0);
-
if (Bank != &AMDGPU::VGPRRegBank) {
// We need to copy from AGPR to VGPR
Src = B.buildCopy(Ty, Src).getReg(0);
MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
}
+ Register Dst = MRI.createGenericVirtualRegister(Ty);
+ MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
+ buildReadFirstLaneForType(B, Dst, Src);
+ return Dst;
+}
+
+// Create new vgpr destination register for MI then move it to current
+// MI's sgpr destination using one or more V_READFIRSTLANE_B32 instructions.
+void AMDGPURegisterBankInfo::buildReadFirstLaneDst(MachineIRBuilder &B,
+ MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = getRegBank(Dst, MRI, *TRI);
+ if (DstBank != &AMDGPU::SGPRRegBank)
+ return;
+
+ Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
+ MRI.setRegBank(VgprDst, AMDGPU::VGPRRegBank);
+
+ MI.getOperand(0).setReg(VgprDst);
+ MachineBasicBlock *MBB = MI.getParent();
+ B.setInsertPt(*MBB, std::next(MI.getIterator()));
+ // readFirstLane VgprDst into Dst after MI.
+ return buildReadFirstLaneForType(B, Dst, VgprDst);
+}
+
+void AMDGPURegisterBankInfo::buildReadFirstLaneB32(MachineIRBuilder &B,
+ Register SgprDst,
+ Register VgprSrc) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {SgprDst}, {VgprSrc});
+ MRI.setRegClass(VgprSrc, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(SgprDst, &AMDGPU::SReg_32RegClass);
+}
+
+void AMDGPURegisterBankInfo::buildReadFirstLaneSequenceOfB32(
+ MachineIRBuilder &B, Register SgprDst, Register VgprSrc,
+ unsigned NumElts) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
LLT S32 = LLT::scalar(32);
- unsigned NumParts = Bits / 32;
- SmallVector<Register, 8> SrcParts;
- SmallVector<Register, 8> DstParts;
+ SmallVector<Register, 8> VgprSrcParts;
+ SmallVector<Register, 8> SgprDstParts;
- if (Bits == 32) {
- SrcParts.push_back(Src);
- } else {
- auto Unmerge = B.buildUnmerge(S32, Src);
- for (unsigned i = 0; i < NumParts; ++i)
- SrcParts.push_back(Unmerge.getReg(i));
+ for (unsigned i = 0; i < NumElts; ++i) {
+ VgprSrcParts.push_back(MRI.createGenericVirtualRegister(S32));
+ SgprDstParts.push_back(MRI.createGenericVirtualRegister(S32));
}
- for (unsigned i = 0; i < NumParts; ++i) {
- Register SrcPart = SrcParts[i];
- Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
+ B.buildUnmerge(VgprSrcParts, VgprSrc);
+ for (unsigned i = 0; i < NumElts; ++i) {
+ buildReadFirstLaneB32(B, SgprDstParts[i], VgprSrcParts[i]);
+ }
+ B.buildMergeLikeInstr(SgprDst, SgprDstParts);
+}
- const TargetRegisterClass *Constrained =
- constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
- (void)Constrained;
- assert(Constrained && "Failed to constrain readfirstlane src reg");
+void AMDGPURegisterBankInfo::buildReadFirstLaneForType(MachineIRBuilder &B,
+ Register SgprDst,
+ Register VgprSrc) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT S16 = LLT::scalar(16);
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+ LLT Ty = MRI.getType(SgprDst);
+
+ if (Ty == S16) {
+ Register VgprSrc32 = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(VgprSrc32, AMDGPU::VGPRRegBank);
+ Register SgprDst32 = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(SgprDst32, AMDGPU::SGPRRegBank);
+
+ B.buildAnyExt(VgprSrc32, VgprSrc);
+ buildReadFirstLaneB32(B, SgprDst32, VgprSrc32);
+ B.buildTrunc(SgprDst, SgprDst32);
+ return;
+ }
- B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
+ if (Ty == S32 || Ty == LLT::pointer(3, 32)) {
+ return buildReadFirstLaneB32(B, SgprDst, VgprSrc);
+ }
- DstParts.push_back(DstPart);
+ if (Ty == S64 || Ty == LLT::pointer(0, 64) || Ty == LLT::pointer(1, 64)) {
+ return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc, 2);
}
- if (Bits == 32)
- return DstParts[0];
+ if (Ty.isVector() && Ty.getElementType() == S32) {
+ return buildReadFirstLaneSequenceOfB32(B, SgprDst, VgprSrc,
+ Ty.getNumElements());
+ }
- Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
- MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
- return Dst;
+ llvm_unreachable("Type not supported");
}
/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
@@ -883,7 +933,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setMBB(*LoopBB);
}
- Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
+ Register CurrentLaneReg = buildReadFirstLaneSrc(B, OpReg);
// Build the comparison(s).
unsigned OpSize = OpTy.getSizeInBits();
@@ -1015,10 +1065,23 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
if (Bank == &AMDGPU::SGPRRegBank)
return;
- Reg = buildReadFirstLane(B, MRI, Reg);
+ Reg = buildReadFirstLaneSrc(B, Reg);
MI.getOperand(OpIdx).setReg(Reg);
}
+// MI has uniform inputs and output but only available machine instruction has
+// vgpr dest. Make it uniform by moving dst to sgpr using readfirstlane.
+void AMDGPURegisterBankInfo::constrainVgprDstOpWithReadfirstlane(
+ MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper) const {
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank != &AMDGPU::VGPRRegBank)
+ buildReadFirstLaneDst(B, MI);
+
+ return;
+}
+
/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
/// rest will be in the remainder.
static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
@@ -1591,7 +1654,7 @@ bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
if (!DstOnValu) {
- DstHi = buildReadFirstLane(B, MRI, DstHi);
+ DstHi = buildReadFirstLaneSrc(B, DstHi);
} else {
MulHiInVgpr = true;
}
@@ -2100,6 +2163,17 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.setInstrAndDebugLoc(MI);
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ switch (Opc) {
+ case AMDGPU::G_FADD:
+ applyDefaultMapping(OpdMapper);
+ if (!Subtarget.hasSALUFloatInsts())
+ constrainVgprDstOpWithReadfirstlane(B, MI, OpdMapper);
+ return;
+ default:
+ break;
+ }
+
switch (Opc) {
case AMDGPU::G_CONSTANT:
case AMDGPU::G_IMPLICIT_DEF: {
@@ -3355,6 +3429,28 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
MI.getNumOperands());
}
+const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getDefaultMappingVOPWithPreassignedDef(
+ const MachineInstr &MI) const {
+ SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
+ const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ // Dst reg bank should have been set already by uniformity info
+ OpdsMapping[0] =
+ getPreAssignedOpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
+ unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
+ }
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
+ MI.getNumOperands());
+}
+
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
@@ -3507,6 +3603,22 @@ AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
}
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getPreAssignedOpMapping(
+ Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ const RegisterBank *Bank = getRegBank(Reg, MRI, TRI);
+ assert(Bank);
+ unsigned BankId = Bank->getID();
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ if (Size != 1)
+ assert(BankId == AMDGPU::SGPRRegBankID || BankId == AMDGPU::VGPRRegBankID);
+ else
+ assert(BankId == AMDGPU::SGPRRegBankID || BankId == AMDGPU::VCCRegBankID);
+
+ return AMDGPU::getValueMapping(BankId, Size);
+}
+
const RegisterBankInfo::ValueMapping *
AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
@@ -3623,6 +3735,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ // Switch for uniformity info based regbank selection.
+ // Does not inspect register bank on incoming operands.
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_FADD: {
+ if (!Subtarget.hasSALUFloatInsts())
+ return getDefaultMappingVOPWithPreassignedDef(MI);
+
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ default:
+ break;
+ }
+
switch (MI.getOpcode()) {
default:
return getInvalidInstructionMapping();
@@ -3718,7 +3847,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
- case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index b5d16e70ab23a20..c4a130eb147fea7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -57,14 +57,29 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
iterator_range<MachineBasicBlock::iterator> Range,
SmallSet<Register, 4> &SGPROperandRegs) const;
- Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
- Register Src) const;
+ Register buildReadFirstLaneSrc(MachineIRBuilder &B, Register Src) const;
+
+ void buildReadFirstLaneDst(MachineIRBuilder &B, MachineInstr &MI) const;
+
+ void buildReadFirstLaneForType(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc) const;
+
+ void buildReadFirstLaneB32(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc) const;
+
+ void buildReadFirstLaneSequenceOfB32(MachineIRBuilder &B, Register SgprDst,
+ Register VgprSrc,
+ unsigned NumElts) const;
bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI,
ArrayRef<unsigned> OpIndices) const;
void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
unsigned OpIdx) const;
+ void
+ constrainVgprDstOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper) const;
+
bool applyMappingDynStackAlloc(MachineIRBuilder &B,
const OperandsMapper &OpdMapper,
MachineInstr &MI) const;
@@ -113,6 +128,12 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const;
+ // Return a value mapping for an operand that is same as already assigned
+ // reg bank or corresponds to assigned register class + LLT
+ const ValueMapping *
+ getPreAssignedOpMapping(Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
// Return a value mapping for an operand that is required to be a AGPR.
const ValueMapping *getAGPROpMapping(Register Reg,
const MachineRegisterInfo &MRI,
@@ -152,6 +173,9 @@ class AMDGPURegisterBankInfo final : public AMDGPUGenRegisterBankInfo {
const InstructionMapping &getDefaultMappingSOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
+ const InstructionMapping &
+ getDefaultMappingVOPWithPreassignedDef(const MachineInstr &MI) const;
+
const InstructionMapping &getDefaultMappingAllVGPR(
const MachineInstr &MI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
index f3e561578363967..8c981f92c921272 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fma-add-ext-mul.ll
@@ -49,21 +49,31 @@ define amdgpu_vs <5 x float> @test_5xf16_5xf32_add_ext_mul(<5 x half> inreg %x,
; GFX9-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s7, v0
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s1, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s8, v4
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s2, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s9, v1
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s6, v3
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s3, v0
+; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s10, v2
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s0, v3
+; GFX9-FAST-DENORM-NEXT: v_readfirstlane_b32 s4, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v4, s4
; GFX9-FAST-DENORM-NEXT: ; return to shader part epilog
;
; GFX10-FAST-DENORM-LABEL: test_5xf16_5xf32_add_ext_mul:
@@ -90,23 +100,35 @@ define amdgpu_vs <6 x float> @test_6xf16_6xf32_add_ext_mul_rhs(<6 x half> inreg
; GFX9-FAST-DENORM-LABEL: test_6xf16_6xf32_add_ext_mul_rhs:
; GFX9-FAST-DENORM: ; %bb.0: ; %.entry
; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v0, s3
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
-; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v0, s0, v0
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v1, s4
; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v1, s1, v1
-; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v0, s6, v3
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v1, s7, v4
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v2, s8, v5
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v3, s9, v6
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v4, s10, v7
-; GFX9-FAST-DENORM-NEXT: v_add_f32_e32 v5, s11, v8
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_mov_b32_e32 v2, s5
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX9-FAST-DENORM-NEXT: v_pk_mul_f16 v2, s2, v2
+; GFX9-FAST-DENORM-NEXT: v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FAST-DENORM-NEXT: v_c...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/73684
More information about the llvm-commits
mailing list