[llvm] [AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive (PR #107889)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 11 01:48:32 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel
Author: Jay Foad (jayfoad)
Changes:
Always generate v_cndmask_b32 instead of modifying exec around v_mov_b32. This is expected to be faster because
modifying exec generally causes pipeline stalls.
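As context, a minimal IR sketch of the pattern this patch affects, modeled on the existing llvm.amdgcn.set.inactive.ll tests (the function name and the constant 42 are illustrative, not taken from the patch). Active lanes of `%tmp` keep `%in`; inactive lanes get 42. With this change the resulting V_SET_INACTIVE_B32 pseudo is expected to expand to a single v_cndmask_b32 selecting between the two values under the saved mask of originally active lanes, rather than v_mov_b32 copies bracketed by exec manipulation.

```llvm
; Hypothetical example in the style of llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

define amdgpu_kernel void @set_inactive_example(ptr addrspace(1) %out, i32 %in) {
  ; Inactive lanes receive 42; active lanes keep %in.
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42)
  ; Read the value in whole-wave mode so all lanes observe it.
  %wwm = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp)
  store i32 %wwm, ptr addrspace(1) %out
  ret void
}
```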
---
Patch is 792.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107889.diff
25 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+10-4)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+11-5)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+8-155)
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (-2)
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+4-17)
- (modified) llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp (+1-2)
- (modified) llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp (+30-40)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll (+8-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll (+110-133)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+368-390)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll (+1866-1944)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll (+8-5)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll (+20-17)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll (+270-280)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll (+174-183)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll (+174-183)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll (+270-280)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.chain.arg.ll (+28-27)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (+113-140)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.mir (+6-23)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+351-297)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved.ll (+334-326)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 56f4efda7925f1..e657f668cc656a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
+ bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+ IID == Intrinsic::amdgcn_set_inactive_chain_arg;
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsPermLane16) {
+ IsSetInactive || IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = Src1Parts.getReg(i);
if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 25cb8341c51d53..04d95693f75998 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6108,6 +6108,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
+ bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
+ IID == Intrinsic::amdgcn_set_inactive_chain_arg;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);
@@ -6125,6 +6127,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
@@ -6151,7 +6155,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
- IsPermLane16) {
+ IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
@@ -6167,7 +6171,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
- if (IsPermLane16) {
+ if (IsSetInactive || IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}
@@ -6243,7 +6247,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));
@@ -6252,7 +6256,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(EltIdx, SL, MVT::i32));
Pieces.push_back(
- IsPermLane16
+ IsSetInactive || IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
@@ -6268,7 +6272,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);
- if (IsPermLane16)
+ if (IsSetInactive || IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);
if (IID == Intrinsic::amdgcn_writelane)
@@ -8751,6 +8755,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
+ case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index c6f28af1e5e731..59900d7e7fe9eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2098,21 +2098,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}
-Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
- assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
- for (auto &Op : MI.implicit_operands()) {
- if (Op.isDef())
- continue;
- Register OpReg = Op.getReg();
- if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
- OpReg == AMDGPU::SCC)
- continue;
- return OpReg;
- }
- return Register();
-}
-
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2287,147 +2272,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32:
- case AMDGPU::V_SET_INACTIVE_B64: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
- ? AMDGPU::V_MOV_B64_PSEUDO
- : AMDGPU::V_MOV_B32_e32;
- Register ExecReg = RI.getExec();
+ case AMDGPU::V_SET_INACTIVE_B32: {
+ // Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
Register DstReg = MI.getOperand(0).getReg();
- MachineOperand &ActiveSrc = MI.getOperand(1);
- MachineOperand &InactiveSrc = MI.getOperand(2);
-
- // Find implicit register defining lanes active outside WWM.
- Register ExecSrcReg = findSetInactiveMask(MI);
- assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
- // Note: default here is set to ExecReg so that functional MIR is still
- // generated if implicit def is not found and assertions are disabled.
- if (!ExecSrcReg)
- ExecSrcReg = ExecReg;
-
- // Ideally in WWM this operation is lowered to V_CNDMASK; however,
- // constant bus constraints and the presence of literal constants
- // present an issue.
- // Fallback to V_MOV base lowering in all but the common cases.
- const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
- MachineFunction *MF = MBB.getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
- const MCInstrDesc &Desc = get(Opcode);
-
- const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
- const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
- const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
- const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
- const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
- const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
-
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
- int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
-
- int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
- int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
- int ConstantBusUses =
- 1 + // Starts at 1 for ExecSrcReg
- (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
- (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
- int LiteralConstants =
- ((ActiveSrc.isReg() ||
- (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
- ? 0
- : 1) +
- ((InactiveSrc.isReg() ||
- (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
- ? 0
- : 1);
-
- bool UseVCndMask =
- ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
- if (VMov64 && UseVCndMask) {
- // Decomposition must not introduce new literals.
- UseVCndMask &=
- ActiveSrc.isReg() ||
- (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
- (!isInlineConstant(ActiveImm));
- UseVCndMask &= InactiveSrc.isReg() ||
- (isInlineConstant(InactiveImmLo) &&
- isInlineConstant(InactiveImmHi)) ||
- (!isInlineConstant(InactiveImm));
- }
-
- if (UseVCndMask && VMov64) {
- // Dual V_CNDMASK_B32
- MachineOperand ActiveLo = buildExtractSubRegOrImm(
- MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
- MachineOperand ActiveHi = buildExtractSubRegOrImm(
- MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
- MachineOperand InactiveLo = buildExtractSubRegOrImm(
- MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
- MachineOperand InactiveHi = buildExtractSubRegOrImm(
- MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
- if (ActiveSrc.isReg())
- ActiveHi.setIsKill(ActiveSrc.isKill());
- if (InactiveSrc.isReg())
- InactiveHi.setIsKill(InactiveSrc.isKill());
- BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
- .addImm(0)
- .add(InactiveLo)
- .addImm(0)
- .add(ActiveLo)
- .addReg(ExecSrcReg)
- .addReg(DstReg, RegState::ImplicitDefine);
- BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
- .addImm(0)
- .add(InactiveHi)
- .addImm(0)
- .add(ActiveHi)
- .addReg(ExecSrcReg)
- .addReg(DstReg, RegState::ImplicitDefine);
- } else if (UseVCndMask) {
- // Single V_CNDMASK_B32
- BuildMI(MBB, MI, DL, Desc, DstReg)
- .addImm(0)
- .add(InactiveSrc)
- .addImm(0)
- .add(ActiveSrc)
- .addReg(ExecSrcReg);
- } else {
- // Fallback V_MOV case.
- // Avoid unnecessary work if a source VGPR is also the destination.
- // This can happen if WWM register allocation was efficient.
- // Note: this assumes WWM execution.
- bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
- bool DstIsInactive =
- InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
- if (!DstIsInactive) {
- // Set exec mask to inactive lanes,
- // but only if active lanes would be overwritten.
- if (DstIsActive) {
- BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
- .addReg(ExecSrcReg)
- .setOperandDead(3); // Dead scc
- }
- // Copy inactive lanes
- MachineInstr *VMov =
- BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
- if (VMov64)
- expandPostRAPseudo(*VMov);
- }
- if (!DstIsActive) {
- // Set exec mask to active lanes
- BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
- // Copy active lanes
- MachineInstr *VMov =
- BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
- .add(ActiveSrc);
- if (VMov64)
- expandPostRAPseudo(*VMov);
- }
- // Restore WWM
- BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
- }
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(5));
MI.eraseFromParent();
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 71432510fdee4f..4fd9b4366159be 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;
-
- static Register findSetInactiveMask(const MachineInstr &MI);
};
/// \brief Returns true if a reg:subreg pair P has a TRC class
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index b7543238c1300a..5df595ff2cf4a7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
-let Defs = [SCC], isConvergent = 1 in {
-def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
- (ins VSrc_b32: $src, VSrc_b32:$inactive), []>;
-
-def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
- (ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
-} // End Defs = [SCC]
+let isConvergent = 1 in
+def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;
foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
- (V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
-}
-
-foreach vt = Reg64Types.types in {
-def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
- (V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
+ (V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
}
def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
- (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
-
-def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
- (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
+ (V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index 29fef49ee70954..3bf2ea0f9e53ef 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : RPOT) {
bool InWWM = false;
for (MachineInstr &MI : *MBB) {
- if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
+ if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
RegsAssigned |= processDef(MI.getOperand(0));
if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f9d7ead4ff3ecc..8064c07310d09c 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -557,26 +557,18 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// This avoid unnecessarily marking M0 as requiring WQM.
III.Needs |= StateStrictWQM;
GlobalFlags |= StateStrictWQM;
- } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
- Opcode == AMDGPU::V_SET_INACTIVE_B64) {
- // Ignore these if V_SET_INACTIVE which already has exec src register.
- // These are generated by an earlier pass which has seperately ensured
- // WWM and provided a mask of inactive lanes.
- Register ExecSrc = TII->findSetInactiveMask(MI);
- if (!ExecSrc) {
- // Disable strict states; StrictWQM will be added as required later.
- III.Disabled = StateStrict;
- MachineOperand &Inactive = MI.getOperand(2);
- if (Inactive.isReg()) {
- if (Inactive.isUndef()) {
- LowerToCopyInstrs.insert(&MI);
- } else {
- markOperand(MI, Inactive, StateStrictWWM, Worklist);
- }
- }
- SetInactiveInstrs.push_back(&MI);
- BBI.NeedsLowering = true;
+ } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32) {
+ // Disable strict states; StrictWQM will be added as required later.
+ III.Disabled = StateStrict;
+ MachineOperand &Inactive = MI.getOperand(4);
+ if (Inactive.isReg()) {
+ if (Inactive.isUndef() && MI.getOperand(3).getImm() == 0)
+ LowerToCopyInstrs.insert(&MI);
+ else
+ markOperand(MI, Inactive, StateStrictWWM, Worklist);
}
+ SetInactiveInstrs.push_back(&MI);
+ BBI.NeedsLowering = true;
} else if (TII->isDisableWQM(MI)) {
BBI.Needs |= StateExact;
if (!(BBI.InNeeds & StateExact)) {
@@ -1078,10 +1070,12 @@ void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
ActiveLanesReg = 0;
break;
case AMDGPU::V_SET_INACTIVE_B32:
- case AMDGPU::V_SET_INACTIVE_B64:
if (ActiveLanesReg) {
- MI.addOperand(*MBB.getParent(),
- MachineOperand::CreateReg(ActiveLanesReg, false, true));
+ LiveInterval &LI = LIS->getInterval(MI.getOperand(5).getReg());
+ MRI->constrainRegClass(
+ ActiveLanesReg, TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
+ MI.getOperand(5).setReg(ActiveLanesReg);
+ LIS->shrinkToUses(&LI);
} else {
assert(State == StateExact || State == StateWQM);
}
@@ -1527,13 +1521,20 @@ bool SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs) {
LLVM_DEBUG(dbgs() << "simplify: " << *MI);
- Register RecomputeReg = 0;
- if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
- ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/107889
More information about the llvm-commits mailing list