[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)
Diana Picus via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 29 02:51:11 PDT 2024
================
@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_SET_INACTIVE_B32: {
- unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
- // optimizations (mainly Register Coalescer) aware of WWM register liveness.
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
- MI.eraseFromParent();
- break;
- }
+ case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(1));
- expandPostRAPseudo(*Copy);
- auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
- FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
- Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
- MI.getOperand(0).getReg())
- .add(MI.getOperand(2));
- expandPostRAPseudo(*Copy);
- BuildMI(MBB, MI, DL, get(NotOpc), Exec)
- .addReg(Exec);
+ unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+ ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32;
+ Register ExecReg = RI.getExec();
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineOperand &ActiveSrc = MI.getOperand(1);
+ MachineOperand &InactiveSrc = MI.getOperand(2);
+
+ // Find implicit register defining lanes active outside WWM.
+ // Note: default here is set to ExecReg so that functional MIR is still
+ // generated if implicit def is not found and assertions are disabled.
+ Register ExecSrcReg = ExecReg;
+ for (auto &Op : MI.implicit_operands()) {
+ if (Op.isDef() || !Op.isReg())
+ continue;
+ Register OpReg = Op.getReg();
+ if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+ OpReg == AMDGPU::SCC)
+ continue;
+ ExecSrcReg = OpReg;
+ break;
+ }
+ assert(ExecSrcReg != ExecReg &&
+ "V_SET_INACTIVE must be in known WWM region");
+
+ // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+ // constant bus constraints and the presence of literal constants
+ // present an issue.
+ // Fallback to V_MOV base lowering in all but the common cases.
+ const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+ const MachineFunction *MF = MBB.getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+ const MCInstrDesc &Desc = get(Opcode);
+
+ const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+ const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+ const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+ const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+ const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+ const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+ int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+ int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+ int ConstantBusUses =
+ 1 + // Starts at 1 for ExecSrcReg
+ (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+ (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+ int LiteralConstants =
+ (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+ (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+ bool UseVCndMask =
+ ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+ if (VMov64 && UseVCndMask) {
+ // Decomposition must not introduce new literals.
+ UseVCndMask &=
+ ActiveSrc.isReg() ||
+ (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmLo)) ||
+ (!isInlineConstant(ActiveImm));
+ UseVCndMask &= InactiveSrc.isReg() ||
+ (isInlineConstant(InactiveImmLo) &&
+ isInlineConstant(InactiveImmLo)) ||
----------------
rovka wrote:
```suggestion
isInlineConstant(InactiveImmHi)) ||
```
https://github.com/llvm/llvm-project/pull/98864
More information about the llvm-commits mailing list