[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 3 22:21:56 PDT 2024
================
@@ -2273,37 +2287,159 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = RI.getExec();
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ // Find implicit register defining lanes active outside WWM.
+ Register ExecSrcReg = findSetInactiveMask(MI);
+ assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
+ // Note: default here is set to ExecReg so that functional MIR is still
+ // generated if implicit def is not found and assertions are disabled.
+ if (!ExecSrcReg)
+ ExecSrcReg = ExecReg;
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+ // Fallback to V_MOV base lowering in all but the common cases.
+ const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+ const MachineFunction *MF = MBB.getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+
+ const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+ const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+ const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+ const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+ const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+ const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses =
+ 1 + // Starts at 1 for ExecSrcReg
+ (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+ (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+ int LiteralConstants =
+ ((ActiveSrc.isReg() ||
+ (ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
+ ? 0
+ : 1) +
+ ((InactiveSrc.isReg() ||
+ (InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
+ ? 0
+ : 1);
+
+ bool UseVCndMask =
+ ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+ if (VMov64 && UseVCndMask) {
+ // Decomposition must not introduce new literals.
+ UseVCndMask &=
+ ActiveSrc.isReg() ||
+ (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+ (!isInlineConstant(ActiveImm));
+ UseVCndMask &= InactiveSrc.isReg() ||
+ (isInlineConstant(InactiveImmLo) &&
+ isInlineConstant(InactiveImmHi)) ||
+ (!isInlineConstant(InactiveImm));
+ }
+
+ if (UseVCndMask && VMov64) {
+ // Dual V_CNDMASK_B32
+ MachineOperand ActiveLo =
----------------
arsenm wrote:
It would be nicer to extract this into some variant of `buildExtractSubRegOrImm` that handles physical registers.
https://github.com/llvm/llvm-project/pull/98864
More information about the llvm-commits mailing list