[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 01:59:37 PDT 2024
================
@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = RI.getExec();
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ // Find implicit register defining lanes active outside WWM.
+ // Note: default here is set to ExecReg so that functional MIR is still
+ // generated if implicit def is not found and assertions are disabled.
+ Register ExecSrcReg = ExecReg;
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef() || !Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ ExecSrcReg = OpReg;
+ break;
+ }
+ assert(ExecSrcReg != ExecReg &&
+ "V_SET_INACTIVE must be in known WWM region");
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+ // Fall back to V_MOV-based lowering in all but the common cases.
+ const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+ const MachineFunction *MF = MBB.getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+
+ const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+ const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+ const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+ const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+ const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+ const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses =
+ 1 + // Starts at 1 for ExecSrcReg
+ (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+ (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+ int LiteralConstants =
+ (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+ (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+ bool UseVCndMask =
+ ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+ if (VMov64 && UseVCndMask) {
+ // Decomposition must not introduce new literals.
+ UseVCndMask &=
+ ActiveSrc.isReg() ||
+ (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+ (!isInlineConstant(ActiveImm));
+ UseVCndMask &= InactiveSrc.isReg() ||
+ (isInlineConstant(InactiveImmLo) &&
isInlineConstant(InactiveImmHi)) ||
+ (!isInlineConstant(InactiveImm));
+ }
+
+ if (UseVCndMask && VMov64) {
+ // Dual V_CNDMASK_B32
+ MachineOperand ActiveLo =
+ ActiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+ /*isImp=*/false, /*isKill*/ false)
+ : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+ MachineOperand ActiveHi =
+ ActiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+ /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+ : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+ MachineOperand InactiveLo =
+ InactiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+ /*isImp=*/false, /*isKill*/ false)
+ : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+ MachineOperand InactiveHi =
+ InactiveSrc.isReg()
+ ? MachineOperand::CreateReg(
+ RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+ /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+ : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+ .addImm(0)
+ .add(InactiveLo)
+ .addImm(0)
+ .add(ActiveLo)
+ .addReg(ExecSrcReg)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+ .addImm(0)
+ .add(InactiveHi)
+ .addImm(0)
+ .add(ActiveHi)
+ .addReg(ExecSrcReg)
+ .addReg(DstReg, RegState::ImplicitDefine);
+ } else if (UseVCndMask) {
+ // Single V_CNDMASK_B32
+ BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+ .addImm(0)
+ .add(InactiveSrc)
+ .addImm(0)
+ .add(ActiveSrc)
+ .addReg(ExecSrcReg);
+ } else {
+ // Fallback V_MOV case.
+ // Avoid unnecessary work if a source VGPR is also the destination.
+ // This can happen if WWM register allocation was efficient.
+ // Note: this assumes WWM execution.
+ bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+ bool DstIsInactive =
+ InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+ if (!DstIsInactive) {
+ // Set exec mask to inactive lanes,
+ // but only if active lanes would be overwritten.
+ if (DstIsActive) {
+ MachineInstr *ExecMI =
+ BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+ ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
----------------
arsenm wrote:
Can use setOperandDead on the MachineInstrBuilder, no need to use addRegisterDead. Can just hardcode the implicit reg index
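A minimal sketch of what that could look like for the exec flip above, assuming the implicit $scc def of S_NOT_B32/B64 sits at operand index 2 (right after the two explicit operands):

    BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
        .addReg(ExecSrcReg)
        .setOperandDead(2); // assumed index of the implicit $scc def

This would replace the separate addRegisterDead(AMDGPU::SCC, TRI) call on ExecMI.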
https://github.com/llvm/llvm-project/pull/98864
More information about the llvm-commits mailing list