[llvm] [AMDGPU] V_SET_INACTIVE optimizations (PR #98864)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 29 01:59:37 PDT 2024


================
@@ -2273,37 +2273,162 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
-  case AMDGPU::V_SET_INACTIVE_B32: {
-    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
-    // optimizations (mainly Register Coalescer) aware of WWM register liveness.
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-        .add(MI.getOperand(1));
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
-      .add(MI.getOperand(2));
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
-    MI.eraseFromParent();
-    break;
-  }
+  case AMDGPU::V_SET_INACTIVE_B32:
   case AMDGPU::V_SET_INACTIVE_B64: {
     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
-    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
-    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                                 MI.getOperand(0).getReg())
-                             .add(MI.getOperand(1));
-    expandPostRAPseudo(*Copy);
-    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
-    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
-    Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
-                   MI.getOperand(0).getReg())
-               .add(MI.getOperand(2));
-    expandPostRAPseudo(*Copy);
-    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
-      .addReg(Exec);
+    unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+    unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
+                           ? AMDGPU::V_MOV_B64_PSEUDO
+                           : AMDGPU::V_MOV_B32_e32;
+    Register ExecReg = RI.getExec();
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand &ActiveSrc = MI.getOperand(1);
+    MachineOperand &InactiveSrc = MI.getOperand(2);
+
+    // Find the implicit register use naming the lanes active outside WWM.
+    // Note: ExecReg is used as the default so that functional MIR is still
+    // generated if the implicit use is not found and assertions are disabled.
+    Register ExecSrcReg = ExecReg;
+    for (auto &Op : MI.implicit_operands()) {
+      if (Op.isDef() || !Op.isReg())
+        continue;
+      Register OpReg = Op.getReg();
+      if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
+          OpReg == AMDGPU::SCC)
+        continue;
+      ExecSrcReg = OpReg;
+      break;
+    }
+    assert(ExecSrcReg != ExecReg &&
+           "V_SET_INACTIVE must be in known WWM region");
+
+    // Ideally in WWM this operation is lowered to V_CNDMASK; however,
+    // constant bus constraints and the presence of literal constants
+    // can make that impossible.
+    // Fall back to V_MOV-based lowering in all but the common cases.
+    const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
+    const MachineFunction *MF = MBB.getParent();
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
+    const MCInstrDesc &Desc = get(Opcode);
+
+    const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
+    const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
+    const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
+    const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
+    const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
+    const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());
+
+    int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+
+    int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
+    int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
+    int ConstantBusUses =
+        1 + // Starts at 1 for ExecSrcReg
+        (usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
+        (usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
+    int LiteralConstants =
+        (ActiveSrc.isImm() && !isInlineConstant(ActiveImm) ? 1 : 0) +
+        (InactiveSrc.isImm() && !isInlineConstant(InactiveImm) ? 1 : 0);
+
+    bool UseVCndMask =
+        ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
+    if (VMov64 && UseVCndMask) {
+      // Decomposition must not introduce new literals.
+      UseVCndMask &=
+          ActiveSrc.isReg() ||
+          (isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
+          (!isInlineConstant(ActiveImm));
+      UseVCndMask &= InactiveSrc.isReg() ||
+                     (isInlineConstant(InactiveImmLo) &&
+                      isInlineConstant(InactiveImmHi)) ||
+                     (!isInlineConstant(InactiveImm));
+    }
+
+    if (UseVCndMask && VMov64) {
+      // Dual V_CNDMASK_B32
+      MachineOperand ActiveLo =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(ActiveImmLo.getSExtValue());
+      MachineOperand ActiveHi =
+          ActiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(ActiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ ActiveSrc.isKill())
+              : MachineOperand::CreateImm(ActiveImmHi.getSExtValue());
+      MachineOperand InactiveLo =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub0), false,
+                    /*isImp=*/false, /*isKill*/ false)
+              : MachineOperand::CreateImm(InactiveImmLo.getSExtValue());
+      MachineOperand InactiveHi =
+          InactiveSrc.isReg()
+              ? MachineOperand::CreateReg(
+                    RI.getSubReg(InactiveSrc.getReg(), AMDGPU::sub1), false,
+                    /*isImp=*/false, /*isKill*/ InactiveSrc.isKill())
+              : MachineOperand::CreateImm(InactiveImmHi.getSExtValue());
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub0))
+          .addImm(0)
+          .add(InactiveLo)
+          .addImm(0)
+          .add(ActiveLo)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+      BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DstReg, AMDGPU::sub1))
+          .addImm(0)
+          .add(InactiveHi)
+          .addImm(0)
+          .add(ActiveHi)
+          .addReg(ExecSrcReg)
+          .addReg(DstReg, RegState::ImplicitDefine);
+    } else if (UseVCndMask) {
+      // Single V_CNDMASK_B32
+      BuildMI(MBB, MI, DL, get(Opcode), DstReg)
+          .addImm(0)
+          .add(InactiveSrc)
+          .addImm(0)
+          .add(ActiveSrc)
+          .addReg(ExecSrcReg);
+    } else {
+      // Fallback V_MOV case.
+      // Avoid unnecessary work if a source VGPR is also the destination.
+      // This can happen if WWM register allocation was efficient.
+      // Note: this assumes WWM execution.
+      bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
+      bool DstIsInactive =
+          InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
+      if (!DstIsInactive) {
+        // Set exec mask to inactive lanes,
+        // but only if active lanes would be overwritten.
+        if (DstIsActive) {
+          MachineInstr *ExecMI =
+              BuildMI(MBB, MI, DL, get(NotOpc), ExecReg).addReg(ExecSrcReg);
+          ExecMI->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
----------------
arsenm wrote:

Can use setOperandDead on the MachineInstrBuilder; no need to use addRegisterDead. Can just hardcode the implicit reg index.
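
A minimal sketch of that suggestion (assuming S_NOT_B32/B64 keep the usual
SOP1 operand layout, so the implicit-def $scc appended by BuildMI lands at
operand index 2: dst (0), src0 (1), scc (2); the hardcoded index would need
to match the actual layout):

    // Mark the dead SCC def directly on the MachineInstrBuilder instead of
    // searching for it afterwards with addRegisterDead().
    MachineInstr *ExecMI = BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
                               .addReg(ExecSrcReg)
                               .setOperandDead(2); // SCC is overwritten

setOperandDead() just calls setIsDead() on the operand at the given index,
so this is safe whenever the implicit-def position is fixed by the opcode's
MCInstrDesc.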

https://github.com/llvm/llvm-project/pull/98864

