[llvm] [AMDGPU] Add rotate/funnel shift pattern matching in instruction selection (PR #149817)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 06:51:19 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Aleksandar Spasojevic (aleksandar-amd)
<details>
<summary>Changes</summary>
This patch implements pattern recognition for rotate and funnel shift operations
in the instruction selection pass, converting shift+OR sequences back into efficient
V_ALIGNBIT_B32 instructions. ROTR and FSHR are made non-legal to force expansion
into shift sequences, which allows divergence-aware instruction selection to choose
the optimal instructions.
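
As a minimal illustration (hypothetical IR, not taken from the patch's test updates; the function name and the expected selection noted in the comments are assumptions), this is the simplest shape the selector matches: a constant-amount rotate whose shift amounts sum to 32, which should fold back into a single V_ALIGNBIT_B32 when the OR is divergent:

```llvm
; Sketch only: assumes %x ends up in a VGPR so the i32 OR is divergent.
; rotr(%x, 8) written as the shl/lshr/or expansion produced once ROTR/FSHR
; are no longer legal; 24 + 8 == 32, so selection is expected to produce
; v_alignbit_b32 dst, %x, %x, 8 instead of two shifts and an OR.
define i32 @rotr_i32_by_8(i32 %x) {
  %hi  = shl  i32 %x, 24
  %lo  = lshr i32 %x, 8
  %rot = or   i32 %hi, %lo
  ret i32 %rot
}
```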
---
Patch is 4.45 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149817.diff
70 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+169)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+7-3)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+227)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+1)
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+1-4)
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+517-454)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+169-183)
- (removed) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir (-41)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir (+72-39)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir (+87-15)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir (+65-14)
- (removed) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir (-168)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+18682-17264)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+788-686)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+1689-1485)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+147-121)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+4230-3740)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+374-324)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+438-384)
- (modified) llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll (+33-33)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+20-18)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+1594-1277)
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+37-32)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+197-181)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+94-88)
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+94-88)
- (modified) llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+3-1)
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+14-14)
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+7-5)
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+27-23)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+128-124)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+204-192)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+173-165)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+173-165)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+136-128)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+51-42)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+8-6)
- (modified) llvm/test/CodeGen/AMDGPU/fneg.bf16.ll (+21-16)
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+10-8)
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+353-244)
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+592-420)
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+55-39)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+350-322)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+262-246)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+262-246)
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+200-184)
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+22-26)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+29-24)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll (+25-18)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll (+10-7)
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+383-341)
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+510-357)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+100-92)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+144-136)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+144-136)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+100-92)
- (modified) llvm/test/CodeGen/AMDGPU/packetizer.ll (+3)
- (modified) llvm/test/CodeGen/AMDGPU/permute.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+138-118)
- (modified) llvm/test/CodeGen/AMDGPU/rotate-add.ll (+38-16)
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+136-68)
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+229-48)
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll (+10-8)
- (modified) llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll (+8-7)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..c61f3a54ec2b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -820,6 +820,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectSTACKRESTORE(N);
return;
}
+ case ISD::OR: {
+ if (SDNode *Selected = selectRotateOrFunnelShiftPattern(N)) {
+ ReplaceNode(N, Selected);
+ return;
+ }
+ break;
+ }
}
SelectCode(N);
@@ -4105,6 +4112,168 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
} while (IsModified);
}
+// Match expanded rotate/funnel shift patterns (shift + OR sequences)
+// and convert them to v_alignbit_b32 instructions.
+SDNode *AMDGPUDAGToDAGISel::selectRotateOrFunnelShiftPattern(SDNode *N) {
+ if (N->getOpcode() != ISD::OR)
+ return nullptr;
+
+ // Only handle 32-bit operations
+ if (N->getValueType(0) != MVT::i32)
+ return nullptr;
+
+ if (!N->isDivergent())
+ return nullptr;
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ SDNode *ShlNode = nullptr;
+ SDNode *SrlNode = nullptr;
+
+ // Check both orderings: (shl, srl) and (srl, shl)
+ bool IsLHSShl = LHS.getOpcode() == ISD::SHL;
+ bool IsRHSSrl = RHS.getOpcode() == ISD::SRL;
+ bool IsLHSSrl = LHS.getOpcode() == ISD::SRL;
+ bool IsRHSShl = RHS.getOpcode() == ISD::SHL;
+
+ if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+ ShlNode = IsLHSShl ? LHS.getNode() : RHS.getNode();
+ SrlNode = IsRHSSrl ? RHS.getNode() : LHS.getNode();
+ } else {
+ return nullptr;
+ }
+
+ // Extract sources and shift amounts
+ SDValue ShlSrc = ShlNode->getOperand(0);
+ SDValue ShlAmt = ShlNode->getOperand(1);
+ SDValue SrlSrc = SrlNode->getOperand(0);
+ SDValue SrlAmt = SrlNode->getOperand(1);
+
+ // Handle the legalizer's (src << 1) pattern for SHL source
+ if (ShlSrc.getOpcode() == ISD::SHL)
+ if (ConstantSDNode *PreShlAmt =
+ dyn_cast<ConstantSDNode>(ShlSrc.getOperand(1)))
+ if (PreShlAmt->getZExtValue() == 1)
+ ShlSrc = ShlSrc.getOperand(0);
+
+ // Helper function to build AlignBit instruction
+ auto buildAlignBitInstruction = [&](SDValue AlignBitSrc0,
+ SDValue AlignBitSrc1,
+ SDValue ShiftAmount) -> SDNode * {
+ SDLoc DL(N);
+
+ // Select opcode based on subtarget features
+ const GCNSubtarget &ST = CurDAG->getSubtarget<GCNSubtarget>();
+ unsigned Opcode =
+ ST.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? (ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : ST.hasTrue16BitInsts()
+ ? (ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : AMDGPU::V_ALIGNBIT_B32_e64;
+
+ SDValue Ops[8]; // Maximum operands needed
+ unsigned NumOps = 0;
+
+ if (Opcode == AMDGPU::V_ALIGNBIT_B32_t16_e64 ||
+ Opcode == AMDGPU::V_ALIGNBIT_B32_fake16_e64) {
+ // Extended format with modifiers
+ Ops[0] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src0_modifiers
+ Ops[1] = AlignBitSrc0; // src0
+ Ops[2] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src1_modifiers
+ Ops[3] = AlignBitSrc1; // src1
+ Ops[4] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src2_modifiers
+ Ops[5] = ShiftAmount; // src2
+ Ops[6] = CurDAG->getTargetConstant(0, DL, MVT::i32); // clamp
+ Ops[7] = CurDAG->getTargetConstant(0, DL, MVT::i32); // op_sel
+ NumOps = 8;
+ } else {
+ // Regular e64 format
+ Ops[0] = AlignBitSrc0;
+ Ops[1] = AlignBitSrc1;
+ Ops[2] = ShiftAmount;
+ NumOps = 3;
+ }
+
+ return CurDAG->getMachineNode(Opcode, DL, MVT::i32,
+ ArrayRef<SDValue>(Ops, NumOps));
+ };
+
+ // Case 1: Both shift amounts are constants
+ ConstantSDNode *ShlConstant = dyn_cast<ConstantSDNode>(ShlAmt);
+ ConstantSDNode *SrlConstant = dyn_cast<ConstantSDNode>(SrlAmt);
+
+ if (ShlConstant && SrlConstant) {
+ int64_t ShlVal = ShlConstant->getSExtValue();
+ int64_t SrlVal = SrlConstant->getSExtValue();
+
+ if (ShlVal + SrlVal != 32)
+ return nullptr;
+
+ // Create constant for shift amount
+ SDLoc DL(N);
+ SDValue ConstAmtNode = CurDAG->getTargetConstant(SrlVal, DL, MVT::i32);
+
+ return buildAlignBitInstruction(ShlSrc, SrlSrc, ConstAmtNode);
+ }
+
+ // Helper to extract shift amount from (some_value & 31) pattern
+ auto getShiftAmount = [&](SDValue ShiftAmtVal) -> SDValue {
+ if (ShiftAmtVal.getOpcode() == ISD::AND)
+ if (ConstantSDNode *MaskNode =
+ dyn_cast<ConstantSDNode>(ShiftAmtVal.getOperand(1)))
+ if (MaskNode->getZExtValue() == 31)
+ return ShiftAmtVal.getOperand(0);
+
+ return SDValue();
+ };
+
+ // Case 2: Variable shift amounts - check the AND pattern
+ SDValue ShlAmtSrc = getShiftAmount(ShlAmt);
+ SDValue SrlAmtSrc = getShiftAmount(SrlAmt);
+
+ if (!ShlAmtSrc || !SrlAmtSrc)
+ return nullptr;
+
+ // Check if SHL amount comes from NOT or NEG of the original amount
+ SDValue OriginalAmt;
+ bool IsRotatePattern = false;
+
+ if (ShlAmtSrc.getOpcode() == ISD::XOR) {
+ // FSHR pattern: SHL amount = (~original_amt) & 31
+ if (ConstantSDNode *XorMask =
+ dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(1))) {
+ if (XorMask->getSExtValue() == -1) {
+ if (ShlAmtSrc.getOperand(0) == SrlAmtSrc) {
+ OriginalAmt = SrlAmtSrc;
+ IsRotatePattern = false;
+ }
+ }
+ }
+ } else if (ShlAmtSrc.getOpcode() == ISD::SUB) {
+ // ROTR pattern: SHL amount = (-original_amt) & 31 = (0 - original_amt) & 31
+ if (ConstantSDNode *SubLHS =
+ dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(0))) {
+ if (SubLHS->getZExtValue() == 0) {
+ if (ShlAmtSrc.getOperand(1) == SrlAmtSrc) {
+ OriginalAmt = SrlAmtSrc;
+ IsRotatePattern = true;
+ }
+ }
+ }
+ }
+
+ if (!OriginalAmt)
+ return nullptr;
+
+ SDValue AlignBitSrc0 = ShlSrc;
+ SDValue AlignBitSrc1 = IsRotatePattern ? ShlSrc : SrlSrc;
+
+ return buildAlignBitInstruction(AlignBitSrc0, AlignBitSrc1, OriginalAmt);
+}
+
AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
CodeGenOptLevel OptLevel)
: SelectionDAGISelLegacy(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..b73259054d581 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -284,6 +284,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
void SelectINTRINSIC_VOID(SDNode *N);
void SelectWAVE_ADDRESS(SDNode *N);
void SelectSTACKRESTORE(SDNode *N);
+ SDNode *selectRotateOrFunnelShiftPattern(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b037cdd5393ea..49d122a91c7e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -486,12 +486,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
}
- // The hardware supports 32-bit FSHR, but not FSHL.
- setOperationAction(ISD::FSHR, MVT::i32, Legal);
+ if (Subtarget->isGCN()) {
+ setOperationAction(ISD::FSHR, MVT::i32, Expand);
+ setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Expand);
+ } else {
+ setOperationAction(ISD::FSHR, MVT::i32, Legal);
+ setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Legal);
+ }
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
- setOperationAction(ISD::ROTR, MVT::i64, Expand);
setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..78506d8976f22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -406,6 +406,231 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
+bool AMDGPUInstructionSelector::selectRotateOrFunnelShiftPattern(
+ MachineInstr &I) const {
+ Register DstReg = I.getOperand(0).getReg();
+ Register LHS = I.getOperand(1).getReg();
+ Register RHS = I.getOperand(2).getReg();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ if (!IsVALU)
+ return false;
+
+ // Check if this is a 32-bit operation
+ if (MRI->getType(DstReg).getSizeInBits() != 32)
+ return false;
+
+ MachineInstr *LHSInst = getDefIgnoringCopies(LHS, *MRI);
+ MachineInstr *RHSInst = getDefIgnoringCopies(RHS, *MRI);
+
+ MachineInstr *ShlInst = nullptr;
+ MachineInstr *SrlInst = nullptr;
+
+ // Check both orderings: (shl, srl) and (srl, shl)
+ bool IsLHSShl = LHSInst->getOpcode() == TargetOpcode::G_SHL;
+ bool IsRHSSrl = RHSInst->getOpcode() == TargetOpcode::G_LSHR;
+ bool IsLHSSrl = LHSInst->getOpcode() == TargetOpcode::G_LSHR;
+ bool IsRHSShl = RHSInst->getOpcode() == TargetOpcode::G_SHL;
+
+ if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+ ShlInst = IsLHSShl ? LHSInst : RHSInst;
+ SrlInst = IsRHSSrl ? RHSInst : LHSInst;
+ } else
+ return false;
+
+ // Extract the base sources, handling the legalizer's (src << 1) pattern
+ Register ShlSrc = ShlInst->getOperand(1).getReg();
+ Register SrlSrc = SrlInst->getOperand(1).getReg();
+
+ // Check if SHL source comes from (original_src << 1)
+ MachineInstr *PreShlInst = getDefIgnoringCopies(ShlSrc, *MRI);
+ if (PreShlInst && PreShlInst->getOpcode() == TargetOpcode::G_SHL) {
+ std::optional<ValueAndVReg> PreShlAmt = getIConstantVRegValWithLookThrough(
+ PreShlInst->getOperand(2).getReg(), *MRI);
+ if (PreShlAmt && PreShlAmt->Value.getZExtValue() == 1)
+ ShlSrc = PreShlInst->getOperand(1).getReg();
+ }
+ // Helper function to build AlignBit instruction
+ auto buildAlignBitInstruction = [&](Register AlignBitSrc0,
+ Register AlignBitSrc1,
+ Register ShiftAmount) -> bool {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ // Select opcode based on subtarget features
+ unsigned Opcode =
+ STI.getGeneration() >= AMDGPUSubtarget::GFX11
+ ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : STI.hasTrue16BitInsts()
+ ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+ : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+ : AMDGPU::V_ALIGNBIT_B32_e64;
+
+ // Check constant bus restriction and copy SGPRs to VGPRs if needed
+ unsigned ConstantBusLimit = STI.getConstantBusLimit(Opcode);
+ unsigned SGPRCount = 0;
+
+ Register AlignBitSrc0ToUse = AlignBitSrc0;
+ Register AlignBitSrc1ToUse = AlignBitSrc1;
+ Register ShiftAmountToUse = ShiftAmount;
+
+ // Count SGPR operands
+ SGPRCount += (RBI.getRegBank(AlignBitSrc0, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+ SGPRCount += (RBI.getRegBank(AlignBitSrc1, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+ SGPRCount += (RBI.getRegBank(ShiftAmount, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID)
+ ? 1
+ : 0;
+
+ // If we exceed the constant bus limit, copy SGPRs to VGPRs
+ if (SGPRCount > ConstantBusLimit) {
+ auto copyToVGPRIfNeeded = [&](Register &RegToUse, Register OrigReg) {
+ if (RBI.getRegBank(OrigReg, *MRI, TRI)->getID() ==
+ AMDGPU::SGPRRegBankID &&
+ SGPRCount > ConstantBusLimit) {
+ RegToUse = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_MOV_B32_e32), RegToUse)
+ .addReg(OrigReg);
+ SGPRCount--;
+ }
+ };
+
+ copyToVGPRIfNeeded(AlignBitSrc0ToUse, AlignBitSrc0);
+ copyToVGPRIfNeeded(AlignBitSrc1ToUse, AlignBitSrc1);
+ copyToVGPRIfNeeded(ShiftAmountToUse, ShiftAmount);
+ }
+
+ auto AlignBit = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg);
+
+ if (Opcode == AMDGPU::V_ALIGNBIT_B32_t16_e64 ||
+ Opcode == AMDGPU::V_ALIGNBIT_B32_fake16_e64) {
+ // t16/fake16 variants have extended operand format
+ AlignBit
+ .addImm(0) // src0_modifiers
+ .addReg(AlignBitSrc0ToUse) // src0
+ .addImm(0) // src1_modifiers
+ .addReg(AlignBitSrc1ToUse) // src1
+ .addImm(0) // src2_modifiers
+ .addReg(ShiftAmountToUse) // src2
+ .addImm(0) // clamp
+ .addImm(0); // op_sel
+ } else {
+ AlignBit.addReg(AlignBitSrc0ToUse)
+ .addReg(AlignBitSrc1ToUse)
+ .addReg(ShiftAmountToUse);
+ }
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*AlignBit, TII, TRI, RBI);
+ };
+
+ // Get shift amounts for both SHL and SRL
+ Register ShlAmtReg = ShlInst->getOperand(2).getReg();
+ Register SrlAmtReg = SrlInst->getOperand(2).getReg();
+
+ // Case 1: Both shift amounts are constants (may be through COPY instructions)
+ auto ShlConstVal = getIConstantVRegValWithLookThrough(ShlAmtReg, *MRI);
+ auto SrlConstVal = getIConstantVRegValWithLookThrough(SrlAmtReg, *MRI);
+
+ if (ShlConstVal && SrlConstVal) {
+ int64_t ShlVal = ShlConstVal->Value.getSExtValue();
+ int64_t SrlVal = SrlConstVal->Value.getSExtValue();
+
+ if (ShlVal + SrlVal != 32)
+ return false;
+
+ // Create a constant register for the original shift amount (SRL amount)
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ Register ConstAmtReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), ConstAmtReg)
+ .addImm(SrlVal);
+
+ return buildAlignBitInstruction(ShlSrc, SrlSrc, ConstAmtReg);
+ }
+
+ // Helper to extract shift amount from (some_value & 31) pattern
+ auto getShiftAmount = [&](Register ShiftAmtReg) -> std::optional<Register> {
+ MachineInstr *AndInst = getDefIgnoringCopies(ShiftAmtReg, *MRI);
+ if (AndInst && AndInst->getOpcode() == TargetOpcode::G_AND) {
+ Register AndSrc = AndInst->getOperand(1).getReg();
+ Register AndMask = AndInst->getOperand(2).getReg();
+
+ std::optional<ValueAndVReg> MaskVal =
+ getIConstantVRegValWithLookThrough(AndMask, *MRI);
+ if (MaskVal && MaskVal->Value.getZExtValue() == 31) {
+ return AndSrc;
+ }
+ }
+ return std::nullopt;
+ };
+
+ // Case 2: Variable shift amounts - check the AND/XOR pattern
+ auto ShlAmtSrc = getShiftAmount(ShlAmtReg);
+ auto SrlAmtSrc = getShiftAmount(SrlAmtReg);
+
+ if (!ShlAmtSrc || !SrlAmtSrc)
+ return false;
+
+ MachineInstr *ShlSrcInst = getDefIgnoringCopies(*ShlAmtSrc, *MRI);
+ if (!ShlSrcInst)
+ return false;
+
+ Register OriginalAmt;
+ bool IsRotatePattern = false;
+
+ if (ShlSrcInst->getOpcode() == TargetOpcode::G_XOR) {
+ // FSHR pattern: SHL amount = (~original_amt) & 31
+ Register XorSrc = ShlSrcInst->getOperand(1).getReg();
+ Register XorMask = ShlSrcInst->getOperand(2).getReg();
+
+ std::optional<ValueAndVReg> XorMaskVal =
+ getIConstantVRegValWithLookThrough(XorMask, *MRI);
+ if (!XorMaskVal || XorMaskVal->Value.getSExtValue() != -1)
+ return false;
+
+ if (XorSrc != *SrlAmtSrc)
+ return false;
+
+ OriginalAmt = *SrlAmtSrc;
+ IsRotatePattern = false;
+
+ } else if (ShlSrcInst->getOpcode() == TargetOpcode::G_SUB) {
+ // ROTR pattern: SHL amount = (-original_amt) & 31 = (0 - original_amt) & 31
+ Register SubLHS = ShlSrcInst->getOperand(1).getReg();
+ Register SubRHS = ShlSrcInst->getOperand(2).getReg();
+
+ std::optional<ValueAndVReg> SubLHSVal =
+ getIConstantVRegValWithLookThrough(SubLHS, *MRI);
+ if (!SubLHSVal || SubLHSVal->Value.getZExtValue() != 0)
+ return false;
+
+ if (SubRHS != *SrlAmtSrc)
+ return false;
+
+ OriginalAmt = *SrlAmtSrc;
+ IsRotatePattern = true;
+
+ } else
+ return false;
+
+ // Build V_ALIGNBIT_B32 instruction
+ Register AlignBitSrc0 = ShlSrc;
+ Register AlignBitSrc1 = IsRotatePattern ? ShlSrc : SrlSrc;
+ Register VarShiftAmount = OriginalAmt;
+
+ return buildAlignBitInstruction(AlignBitSrc0, AlignBitSrc1, VarShiftAmount);
+}
+
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
@@ -4033,6 +4258,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_XOR:
if (selectBITOP3(I))
return true;
+ if (I.getOpcode() == TargetOpcode::G_OR && selectRotateOrFunnelShiftPattern(I))
+ return true;
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..46cdf813330b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -97,6 +97,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool selectG_FNEG(MachineInstr &I) const;
bool selectG_FABS(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
+ bool selectRotateOrFunnelShiftPattern(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d2ee5b6..b1b19332d870c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2041,13 +2041,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S32, S64)
.lower();
- getActionDefinitionsBuilder({G_ROTR, G_ROTL})
- .scalarize(0)
- .lower();
+ getActionDefinitionsBuilder({G_ROTR, G_ROTL}).scalarize(0).lower();
// TODO: Only Try to form v2s16 with legal packed instructions.
getActionDefinitionsBuilder(G_FSHR)
- .legalFor({{S32, S32}})
.lowerFor({{V2S16, V2S16}})
.clampMaxNumElementsStrict(0, S16, 2)
.scalarize(0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b54cccead9781..a280b84a4667b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4089,6 +4089,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_SMED3:
case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_ROTR:
+ case AMDGPU::G_ROTL: {
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..3e65697c07450 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1768,102 +1768,102 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/149817