[llvm] [AMDGPU] Optimize rotate/funnel shift pattern matching in instruction selection (PR #149817)
Juan Manuel Martinez Caamaño via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 31 03:05:22 PDT 2025
================
@@ -406,6 +406,231 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectRotateOrFunnelShiftPattern(
+ MachineInstr &I) const {
+ Register DstReg = I.getOperand(0).getReg();
+ Register LHS = I.getOperand(1).getReg();
+ Register RHS = I.getOperand(2).getReg();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ if (!IsVALU)
+ return false;
+
+ // Check if this is a 32-bit operation
+ if (MRI->getType(DstReg).getSizeInBits() != 32)
+ return false;
+
+ MachineInstr *LHSInst = getDefIgnoringCopies(LHS, *MRI);
+ MachineInstr *RHSInst = getDefIgnoringCopies(RHS, *MRI);
+
+ MachineInstr *ShlInst = nullptr;
+ MachineInstr *SrlInst = nullptr;
+
+ // Check both orderings: (shl, srl) and (srl, shl)
+ bool IsLHSShl = LHSInst->getOpcode() == TargetOpcode::G_SHL;
+ bool IsRHSSrl = RHSInst->getOpcode() == TargetOpcode::G_LSHR;
+ bool IsLHSSrl = LHSInst->getOpcode() == TargetOpcode::G_LSHR;
+ bool IsRHSShl = RHSInst->getOpcode() == TargetOpcode::G_SHL;
+
+ if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+ ShlInst = IsLHSShl ? LHSInst : RHSInst;
+ SrlInst = IsRHSSrl ? RHSInst : LHSInst;
+ } else
+ return false;
+
+ // Extract the base sources, handling the legalizer's (src << 1) pattern
+ Register ShlSrc = ShlInst->getOperand(1).getReg();
+ Register SrlSrc = SrlInst->getOperand(1).getReg();
+
+ // Check if SHL source comes from (original_src << 1)
+ MachineInstr *PreShlInst = getDefIgnoringCopies(ShlSrc, *MRI);
+ if (PreShlInst && PreShlInst->getOpcode() == TargetOpcode::G_SHL) {
+ std::optional<ValueAndVReg> PreShlAmt = getIConstantVRegValWithLookThrough(
+ PreShlInst->getOperand(2).getReg(), *MRI);
+ if (PreShlAmt && PreShlAmt->Value.getZExtValue() == 1)
+ ShlSrc = PreShlInst->getOperand(1).getReg();
+ }
+ // Helper function to build AlignBit instruction
+ auto buildAlignBitInstruction = [&](Register AlignBitSrc0,
+ Register AlignBitSrc1,
+ Register ShiftAmount) -> bool {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ // Select opcode based on subtarget features
+ unsigned Opcode =
+ STI.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : STI.hasTrue16BitInsts()
+ ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : AMDGPU::V_ALIGNBIT_B32_e64;
+
+ // Check constant bus restriction and copy SGPRs to VGPRs if needed
+ unsigned ConstantBusLimit = STI.getConstantBusLimit(Opcode);
+ unsigned SGPRCount = 0;
+
+ Register AlignBitSrc0ToUse = AlignBitSrc0;
+ Register AlignBitSrc1ToUse = AlignBitSrc1;
+ Register ShiftAmountToUse = ShiftAmount;
+
+ // Count SGPR operands
+ SGPRCount += (RBI.getRegBank(AlignBitSrc0, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+ SGPRCount += (RBI.getRegBank(AlignBitSrc1, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+ SGPRCount += (RBI.getRegBank(ShiftAmount, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
----------------
jmmartinez wrote:
```suggestion
    for (Register R : {AlignBitSrc0, AlignBitSrc1, ShiftAmount}) {
      SGPRCount += RBI.getRegBank(R, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
    }
```
Or even use `llvm::count_if`:
```cpp
auto IsScalarReg = [&](Register R) {
  return RBI.getRegBank(R, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
};
unsigned SGPRCount = count_if(
    std::initializer_list<Register>{AlignBitSrc0, AlignBitSrc1, ShiftAmount},
    IsScalarReg);
```
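For illustration only, a minimal standalone sketch of the same counting idiom, assuming nothing from the patch itself: `Register`, `isSGPR`, and the sample values below are stand-ins for the real LLVM types and the `RBI.getRegBank(...)` bank query, not the actual code under review.
```cpp
// Standalone sketch: count how many of a fixed set of operands satisfy a
// predicate, mirroring the count_if suggestion above.
#include <algorithm>
#include <cassert>
#include <initializer_list>

using Register = unsigned; // stand-in for llvm::Register

// Stand-in for "this register lives in the SGPR bank"; the threshold is arbitrary.
static bool isSGPR(Register R) { return R < 102; }

int main() {
  // Hypothetical operand registers, analogous to AlignBitSrc0/1 and ShiftAmount.
  Register AlignBitSrc0 = 3, AlignBitSrc1 = 200, ShiftAmount = 7;
  std::initializer_list<Register> Ops = {AlignBitSrc0, AlignBitSrc1, ShiftAmount};
  unsigned SGPRCount = std::count_if(Ops.begin(), Ops.end(), isSGPR);
  assert(SGPRCount == 2);
  return 0;
}
```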
https://github.com/llvm/llvm-project/pull/149817
More information about the llvm-commits mailing list