[llvm] [AMDGPU] Add rotate/funnel shift pattern matching in instruction selection (PR #149817)

via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 21 06:51:19 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-llvm-transforms

Author: Aleksandar Spasojevic (aleksandar-amd)

<details>
<summary>Changes</summary>

This patch implements pattern recognition for rotate and funnel shift operations
in the instruction selection pass, converting the expanded shift+OR sequences back
into efficient V_ALIGNBIT_B32 instructions. ROTR and FSHR are made non-legal to
force their expansion into shift sequences, which allows divergence-aware
instruction selection to choose the optimal instructions.
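
For illustration, here is a minimal hand-written sketch (not taken from the patch or its test files) of the kind of expanded sequence the new OR-rooted matcher recognizes: once ROTR is expanded, a divergent 32-bit rotate-right becomes a masked shift pair joined by an `or`, which is selected back into a single `v_alignbit_b32` with both vector sources set to the rotated value.

```llvm
; rotr(x, amt) after expansion:
;   (x >> (amt & 31)) | (x << ((0 - amt) & 31))
define i32 @rotr_i32_expanded(i32 %x, i32 %amt) {
  %srl.amt = and i32 %amt, 31          ; right-shift amount
  %hi      = lshr i32 %x, %srl.amt
  %neg     = sub i32 0, %amt
  %shl.amt = and i32 %neg, 31          ; left-shift amount = (-amt) & 31
  %lo      = shl i32 %x, %shl.amt
  %rot     = or i32 %hi, %lo           ; selected as: v_alignbit_b32 dst, %x, %x, %amt
  ret i32 %rot
}
```

The funnel-shift (FSHR) form is matched analogously; a companion sketch appears after the diff below.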

---

Patch is 4.45 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149817.diff


70 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+169) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (+7-3) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+227) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+1) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+1-4) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+6) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll (+517-454) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll (+169-183) 
- (removed) llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir (-41) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshl.mir (+72-39) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fshr.mir (+87-15) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-rotl-rotr.mir (+65-14) 
- (removed) llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fshr.mir (-168) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll (+46-46) 
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll (+46-46) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+18682-17264) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll (+788-686) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll (+1689-1485) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll (+147-121) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll (+18-18) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+4230-3740) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll (+374-324) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll (+438-384) 
- (modified) llvm/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll (+33-33) 
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+20-18) 
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+1594-1277) 
- (modified) llvm/test/CodeGen/AMDGPU/bswap.ll (+37-32) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll (+197-181) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll (+94-88) 
- (modified) llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll (+94-88) 
- (modified) llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/build_vector.ll (+3-1) 
- (modified) llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll (+14-14) 
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll (+7-5) 
- (modified) llvm/test/CodeGen/AMDGPU/fabs.bf16.ll (+27-23) 
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll (+128-124) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+204-192) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+173-165) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+173-165) 
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+136-128) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll (+51-42) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+8-6) 
- (modified) llvm/test/CodeGen/AMDGPU/fneg.bf16.ll (+21-16) 
- (modified) llvm/test/CodeGen/AMDGPU/freeze.ll (+10-8) 
- (modified) llvm/test/CodeGen/AMDGPU/fshl.ll (+353-244) 
- (modified) llvm/test/CodeGen/AMDGPU/fshr.ll (+592-420) 
- (modified) llvm/test/CodeGen/AMDGPU/function-args.ll (+55-39) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll (+350-322) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll (+262-246) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll (+262-246) 
- (modified) llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll (+200-184) 
- (modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+22-26) 
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll (+29-24) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll (+25-18) 
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll (+10-7) 
- (modified) llvm/test/CodeGen/AMDGPU/load-constant-i8.ll (+383-341) 
- (modified) llvm/test/CodeGen/AMDGPU/load-global-i8.ll (+510-357) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll (+100-92) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll (+144-136) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll (+144-136) 
- (modified) llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll (+100-92) 
- (modified) llvm/test/CodeGen/AMDGPU/packetizer.ll (+3) 
- (modified) llvm/test/CodeGen/AMDGPU/permute.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/permute_i8.ll (+138-118) 
- (modified) llvm/test/CodeGen/AMDGPU/rotate-add.ll (+38-16) 
- (modified) llvm/test/CodeGen/AMDGPU/rotl.ll (+136-68) 
- (modified) llvm/test/CodeGen/AMDGPU/rotr.ll (+229-48) 
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll (+10-8) 
- (modified) llvm/test/Transforms/InferAddressSpaces/SPIRV/generic-cast-explicit.ll (+8-7) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..c61f3a54ec2b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -820,6 +820,13 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     SelectSTACKRESTORE(N);
     return;
   }
+  case ISD::OR: {
+    if (SDNode *Selected = selectRotateOrFunnelShiftPattern(N)) {
+      ReplaceNode(N, Selected);
+      return;
+    }
+    break;
+  }
   }
 
   SelectCode(N);
@@ -4105,6 +4112,168 @@ void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   } while (IsModified);
 }
 
+// Pattern matching for rotate/funnel shift operations
+// and converts them to v_alignbit_b32 instructions
+SDNode *AMDGPUDAGToDAGISel::selectRotateOrFunnelShiftPattern(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return nullptr;
+
+  // Only handle 32-bit operations
+  if (N->getValueType(0) != MVT::i32)
+    return nullptr;
+
+  if (!N->isDivergent())
+    return nullptr;
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+
+  SDNode *ShlNode = nullptr;
+  SDNode *SrlNode = nullptr;
+
+  // Check both orderings: (shl, srl) and (srl, shl)
+  bool IsLHSShl = LHS.getOpcode() == ISD::SHL;
+  bool IsRHSSrl = RHS.getOpcode() == ISD::SRL;
+  bool IsLHSSrl = LHS.getOpcode() == ISD::SRL;
+  bool IsRHSShl = RHS.getOpcode() == ISD::SHL;
+
+  if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+    ShlNode = IsLHSShl ? LHS.getNode() : RHS.getNode();
+    SrlNode = IsRHSSrl ? RHS.getNode() : LHS.getNode();
+  } else {
+    return nullptr;
+  }
+
+  // Extract sources and shift amounts
+  SDValue ShlSrc = ShlNode->getOperand(0);
+  SDValue ShlAmt = ShlNode->getOperand(1);
+  SDValue SrlSrc = SrlNode->getOperand(0);
+  SDValue SrlAmt = SrlNode->getOperand(1);
+
+  // Handle the legalizer's (src << 1) pattern for SHL source
+  if (ShlSrc.getOpcode() == ISD::SHL)
+    if (ConstantSDNode *PreShlAmt =
+            dyn_cast<ConstantSDNode>(ShlSrc.getOperand(1)))
+      if (PreShlAmt->getZExtValue() == 1)
+        ShlSrc = ShlSrc.getOperand(0);
+
+  // Helper function to build AlignBit instruction
+  auto buildAlignBitInstruction = [&](SDValue AlignBitSrc0,
+                                      SDValue AlignBitSrc1,
+                                      SDValue ShiftAmount) -> SDNode * {
+    SDLoc DL(N);
+
+    // Select opcode based on subtarget features
+    const GCNSubtarget &ST = CurDAG->getSubtarget<GCNSubtarget>();
+    unsigned Opcode =
+        ST.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? (ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+                                       : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+        : ST.hasTrue16BitInsts()
+            ? (ST.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+                                       : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+            : AMDGPU::V_ALIGNBIT_B32_e64;
+
+    SDValue Ops[8]; // Maximum operands needed
+    unsigned NumOps = 0;
+
+    if (Opcode == AMDGPU::V_ALIGNBIT_B32_t16_e64 ||
+        Opcode == AMDGPU::V_ALIGNBIT_B32_fake16_e64) {
+      // Extended format with modifiers
+      Ops[0] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src0_modifiers
+      Ops[1] = AlignBitSrc0;                               // src0
+      Ops[2] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src1_modifiers
+      Ops[3] = AlignBitSrc1;                               // src1
+      Ops[4] = CurDAG->getTargetConstant(0, DL, MVT::i32); // src2_modifiers
+      Ops[5] = ShiftAmount;                                // src2
+      Ops[6] = CurDAG->getTargetConstant(0, DL, MVT::i32); // clamp
+      Ops[7] = CurDAG->getTargetConstant(0, DL, MVT::i32); // op_sel
+      NumOps = 8;
+    } else {
+      // Regular e64 format
+      Ops[0] = AlignBitSrc0;
+      Ops[1] = AlignBitSrc1;
+      Ops[2] = ShiftAmount;
+      NumOps = 3;
+    }
+
+    return CurDAG->getMachineNode(Opcode, DL, MVT::i32,
+                                  ArrayRef<SDValue>(Ops, NumOps));
+  };
+
+  // Case 1: Both shift amounts are constants
+  ConstantSDNode *ShlConstant = dyn_cast<ConstantSDNode>(ShlAmt);
+  ConstantSDNode *SrlConstant = dyn_cast<ConstantSDNode>(SrlAmt);
+
+  if (ShlConstant && SrlConstant) {
+    int64_t ShlVal = ShlConstant->getSExtValue();
+    int64_t SrlVal = SrlConstant->getSExtValue();
+
+    if (ShlVal + SrlVal != 32)
+      return nullptr;
+
+    // Create constant for shift amount
+    SDLoc DL(N);
+    SDValue ConstAmtNode = CurDAG->getTargetConstant(SrlVal, DL, MVT::i32);
+
+    return buildAlignBitInstruction(ShlSrc, SrlSrc, ConstAmtNode);
+  }
+
+  // Helper to extract shift amount from (some_value & 31) pattern
+  auto getShiftAmount = [&](SDValue ShiftAmtVal) -> SDValue {
+    if (ShiftAmtVal.getOpcode() == ISD::AND)
+      if (ConstantSDNode *MaskNode =
+              dyn_cast<ConstantSDNode>(ShiftAmtVal.getOperand(1)))
+        if (MaskNode->getZExtValue() == 31)
+          return ShiftAmtVal.getOperand(0);
+
+    return SDValue();
+  };
+
+  // Case 2: Variable shift amounts - check the AND pattern
+  SDValue ShlAmtSrc = getShiftAmount(ShlAmt);
+  SDValue SrlAmtSrc = getShiftAmount(SrlAmt);
+
+  if (!ShlAmtSrc || !SrlAmtSrc)
+    return nullptr;
+
+  // Check if SHL amount comes from NOT or NEG of the original amount
+  SDValue OriginalAmt;
+  bool IsRotatePattern = false;
+
+  if (ShlAmtSrc.getOpcode() == ISD::XOR) {
+    // FSHR pattern: SHL amount = (~original_amt) & 31
+    if (ConstantSDNode *XorMask =
+            dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(1))) {
+      if (XorMask->getSExtValue() == -1) {
+        if (ShlAmtSrc.getOperand(0) == SrlAmtSrc) {
+          OriginalAmt = SrlAmtSrc;
+          IsRotatePattern = false;
+        }
+      }
+    }
+  } else if (ShlAmtSrc.getOpcode() == ISD::SUB) {
+    // ROTR pattern: SHL amount = (-original_amt) & 31 = (0 - original_amt) & 31
+    if (ConstantSDNode *SubLHS =
+            dyn_cast<ConstantSDNode>(ShlAmtSrc.getOperand(0))) {
+      if (SubLHS->getZExtValue() == 0) {
+        if (ShlAmtSrc.getOperand(1) == SrlAmtSrc) {
+          OriginalAmt = SrlAmtSrc;
+          IsRotatePattern = true;
+        }
+      }
+    }
+  }
+
+  if (!OriginalAmt)
+    return nullptr;
+
+  SDValue AlignBitSrc0 = ShlSrc;
+  SDValue AlignBitSrc1 = IsRotatePattern ? ShlSrc : SrlSrc;
+
+  return buildAlignBitInstruction(AlignBitSrc0, AlignBitSrc1, OriginalAmt);
+}
+
 AMDGPUDAGToDAGISelLegacy::AMDGPUDAGToDAGISelLegacy(TargetMachine &TM,
                                                    CodeGenOptLevel OptLevel)
     : SelectionDAGISelLegacy(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..b73259054d581 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -284,6 +284,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   void SelectINTRINSIC_VOID(SDNode *N);
   void SelectWAVE_ADDRESS(SDNode *N);
   void SelectSTACKRESTORE(SDNode *N);
+  SDNode *selectRotateOrFunnelShiftPattern(SDNode *N);
 
 protected:
   // Include the pieces autogenerated from the target description.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index b037cdd5393ea..49d122a91c7e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -486,12 +486,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
   }
 
-  // The hardware supports 32-bit FSHR, but not FSHL.
-  setOperationAction(ISD::FSHR, MVT::i32, Legal);
+  if (Subtarget->isGCN()) {
+    setOperationAction(ISD::FSHR, MVT::i32, Expand);
+    setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Expand);
+  } else {
+    setOperationAction(ISD::FSHR, MVT::i32, Legal);
+    setOperationAction(ISD::ROTR, {MVT::i32, MVT::i64}, Legal);
+  }
 
   // The hardware supports 32-bit ROTR, but not ROTL.
   setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
-  setOperationAction(ISD::ROTR, MVT::i64, Expand);
 
   setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..78506d8976f22 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -406,6 +406,231 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
 }
 
+bool AMDGPUInstructionSelector::selectRotateOrFunnelShiftPattern(
+    MachineInstr &I) const {
+  Register DstReg = I.getOperand(0).getReg();
+  Register LHS = I.getOperand(1).getReg();
+  Register RHS = I.getOperand(2).getReg();
+
+  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+  if (!IsVALU)
+    return false;
+
+  // Check if this is a 32-bit operation
+  if (MRI->getType(DstReg).getSizeInBits() != 32)
+    return false;
+
+  MachineInstr *LHSInst = getDefIgnoringCopies(LHS, *MRI);
+  MachineInstr *RHSInst = getDefIgnoringCopies(RHS, *MRI);
+
+  MachineInstr *ShlInst = nullptr;
+  MachineInstr *SrlInst = nullptr;
+
+  // Check both orderings: (shl, srl) and (srl, shl)
+  bool IsLHSShl = LHSInst->getOpcode() == TargetOpcode::G_SHL;
+  bool IsRHSSrl = RHSInst->getOpcode() == TargetOpcode::G_LSHR;
+  bool IsLHSSrl = LHSInst->getOpcode() == TargetOpcode::G_LSHR;
+  bool IsRHSShl = RHSInst->getOpcode() == TargetOpcode::G_SHL;
+
+  if ((IsLHSShl && IsRHSSrl) || (IsLHSSrl && IsRHSShl)) {
+    ShlInst = IsLHSShl ? LHSInst : RHSInst;
+    SrlInst = IsRHSSrl ? RHSInst : LHSInst;
+  } else
+    return false;
+
+  // Extract the base sources, handling the legalizer's (src << 1) pattern
+  Register ShlSrc = ShlInst->getOperand(1).getReg();
+  Register SrlSrc = SrlInst->getOperand(1).getReg();
+
+  // Check if SHL source comes from (original_src << 1)
+  MachineInstr *PreShlInst = getDefIgnoringCopies(ShlSrc, *MRI);
+  if (PreShlInst && PreShlInst->getOpcode() == TargetOpcode::G_SHL) {
+    std::optional<ValueAndVReg> PreShlAmt = getIConstantVRegValWithLookThrough(
+        PreShlInst->getOperand(2).getReg(), *MRI);
+    if (PreShlAmt && PreShlAmt->Value.getZExtValue() == 1)
+      ShlSrc = PreShlInst->getOperand(1).getReg();
+  }
+  // Helper function to build AlignBit instruction
+  auto buildAlignBitInstruction = [&](Register AlignBitSrc0,
+                                      Register AlignBitSrc1,
+                                      Register ShiftAmount) -> bool {
+    const DebugLoc &DL = I.getDebugLoc();
+    MachineBasicBlock *BB = I.getParent();
+
+    // Select opcode based on subtarget features
+    unsigned Opcode =
+        STI.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+                                        : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+        : STI.hasTrue16BitInsts()
+            ? (STI.useRealTrue16Insts() ? AMDGPU::V_ALIGNBIT_B32_t16_e64
+                                        : AMDGPU::V_ALIGNBIT_B32_fake16_e64)
+            : AMDGPU::V_ALIGNBIT_B32_e64;
+
+    // Check constant bus restriction and copy SGPRs to VGPRs if needed
+    unsigned ConstantBusLimit = STI.getConstantBusLimit(Opcode);
+    unsigned SGPRCount = 0;
+
+    Register AlignBitSrc0ToUse = AlignBitSrc0;
+    Register AlignBitSrc1ToUse = AlignBitSrc1;
+    Register ShiftAmountToUse = ShiftAmount;
+
+    // Count SGPR operands
+    SGPRCount += (RBI.getRegBank(AlignBitSrc0, *MRI, TRI)->getID() ==
+                  AMDGPU::SGPRRegBankID)
+                     ? 1
+                     : 0;
+    SGPRCount += (RBI.getRegBank(AlignBitSrc1, *MRI, TRI)->getID() ==
+                  AMDGPU::SGPRRegBankID)
+                     ? 1
+                     : 0;
+    SGPRCount += (RBI.getRegBank(ShiftAmount, *MRI, TRI)->getID() ==
+                  AMDGPU::SGPRRegBankID)
+                     ? 1
+                     : 0;
+
+    // If we exceed the constant bus limit, copy SGPRs to VGPRs
+    if (SGPRCount > ConstantBusLimit) {
+      auto copyToVGPRIfNeeded = [&](Register &RegToUse, Register OrigReg) {
+        if (RBI.getRegBank(OrigReg, *MRI, TRI)->getID() ==
+                AMDGPU::SGPRRegBankID &&
+            SGPRCount > ConstantBusLimit) {
+          RegToUse = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+          BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_MOV_B32_e32), RegToUse)
+              .addReg(OrigReg);
+          SGPRCount--;
+        }
+      };
+
+      copyToVGPRIfNeeded(AlignBitSrc0ToUse, AlignBitSrc0);
+      copyToVGPRIfNeeded(AlignBitSrc1ToUse, AlignBitSrc1);
+      copyToVGPRIfNeeded(ShiftAmountToUse, ShiftAmount);
+    }
+
+    auto AlignBit = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg);
+
+    if (Opcode == AMDGPU::V_ALIGNBIT_B32_t16_e64 ||
+        Opcode == AMDGPU::V_ALIGNBIT_B32_fake16_e64) {
+      // t16/fake16 variants have extended operand format
+      AlignBit
+          .addImm(0)                 // src0_modifiers
+          .addReg(AlignBitSrc0ToUse) // src0
+          .addImm(0)                 // src1_modifiers
+          .addReg(AlignBitSrc1ToUse) // src1
+          .addImm(0)                 // src2_modifiers
+          .addReg(ShiftAmountToUse)  // src2
+          .addImm(0)                 // clamp
+          .addImm(0);                // op_sel
+    } else {
+      AlignBit.addReg(AlignBitSrc0ToUse)
+          .addReg(AlignBitSrc1ToUse)
+          .addReg(ShiftAmountToUse);
+    }
+
+    I.eraseFromParent();
+    return constrainSelectedInstRegOperands(*AlignBit, TII, TRI, RBI);
+  };
+
+  // Get shift amounts for both SHL and SRL
+  Register ShlAmtReg = ShlInst->getOperand(2).getReg();
+  Register SrlAmtReg = SrlInst->getOperand(2).getReg();
+
+  // Case 1: Both shift amounts are constants (may be through COPY instructions)
+  auto ShlConstVal = getIConstantVRegValWithLookThrough(ShlAmtReg, *MRI);
+  auto SrlConstVal = getIConstantVRegValWithLookThrough(SrlAmtReg, *MRI);
+
+  if (ShlConstVal && SrlConstVal) {
+    int64_t ShlVal = ShlConstVal->Value.getSExtValue();
+    int64_t SrlVal = SrlConstVal->Value.getSExtValue();
+
+    if (ShlVal + SrlVal != 32)
+      return false;
+
+    // Create a constant register for the original shift amount (SRL amount)
+    const DebugLoc &DL = I.getDebugLoc();
+    MachineBasicBlock *BB = I.getParent();
+
+    Register ConstAmtReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), ConstAmtReg)
+        .addImm(SrlVal);
+
+    return buildAlignBitInstruction(ShlSrc, SrlSrc, ConstAmtReg);
+  }
+
+  // Helper to extract shift amount from (some_value & 31) pattern
+  auto getShiftAmount = [&](Register ShiftAmtReg) -> std::optional<Register> {
+    MachineInstr *AndInst = getDefIgnoringCopies(ShiftAmtReg, *MRI);
+    if (AndInst && AndInst->getOpcode() == TargetOpcode::G_AND) {
+      Register AndSrc = AndInst->getOperand(1).getReg();
+      Register AndMask = AndInst->getOperand(2).getReg();
+
+      std::optional<ValueAndVReg> MaskVal =
+          getIConstantVRegValWithLookThrough(AndMask, *MRI);
+      if (MaskVal && MaskVal->Value.getZExtValue() == 31) {
+        return AndSrc;
+      }
+    }
+    return std::nullopt;
+  };
+
+  // Case 2: Variable shift amounts - check the AND/XOR pattern
+  auto ShlAmtSrc = getShiftAmount(ShlAmtReg);
+  auto SrlAmtSrc = getShiftAmount(SrlAmtReg);
+
+  if (!ShlAmtSrc || !SrlAmtSrc)
+    return false;
+
+  MachineInstr *ShlSrcInst = getDefIgnoringCopies(*ShlAmtSrc, *MRI);
+  if (!ShlSrcInst)
+    return false;
+
+  Register OriginalAmt;
+  bool IsRotatePattern = false;
+
+  if (ShlSrcInst->getOpcode() == TargetOpcode::G_XOR) {
+    // FSHR pattern: SHL amount = (~original_amt) & 31
+    Register XorSrc = ShlSrcInst->getOperand(1).getReg();
+    Register XorMask = ShlSrcInst->getOperand(2).getReg();
+
+    std::optional<ValueAndVReg> XorMaskVal =
+        getIConstantVRegValWithLookThrough(XorMask, *MRI);
+    if (!XorMaskVal || XorMaskVal->Value.getSExtValue() != -1)
+      return false;
+
+    if (XorSrc != *SrlAmtSrc)
+      return false;
+
+    OriginalAmt = *SrlAmtSrc;
+    IsRotatePattern = false;
+
+  } else if (ShlSrcInst->getOpcode() == TargetOpcode::G_SUB) {
+    // ROTR pattern: SHL amount = (-original_amt) & 31 = (0 - original_amt) & 31
+    Register SubLHS = ShlSrcInst->getOperand(1).getReg();
+    Register SubRHS = ShlSrcInst->getOperand(2).getReg();
+
+    std::optional<ValueAndVReg> SubLHSVal =
+        getIConstantVRegValWithLookThrough(SubLHS, *MRI);
+    if (!SubLHSVal || SubLHSVal->Value.getZExtValue() != 0)
+      return false;
+
+    if (SubRHS != *SrlAmtSrc)
+      return false;
+
+    OriginalAmt = *SrlAmtSrc;
+    IsRotatePattern = true;
+
+  } else
+    return false;
+
+  // Build V_ALIGNBIT_B32 instruction
+  Register AlignBitSrc0 = ShlSrc;
+  Register AlignBitSrc1 = IsRotatePattern ? ShlSrc : SrlSrc;
+  Register VarShiftAmount = OriginalAmt;
+
+  return buildAlignBitInstruction(AlignBitSrc0, AlignBitSrc1, VarShiftAmount);
+}
+
 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
   MachineBasicBlock *BB = I.getParent();
   MachineFunction *MF = BB->getParent();
@@ -4033,6 +4258,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
   case TargetOpcode::G_XOR:
     if (selectBITOP3(I))
       return true;
+    if (I.getOpcode() == TargetOpcode::G_OR && selectRotateOrFunnelShiftPattern(I))
+      return true;
     if (selectImpl(I, *CoverageInfo))
       return true;
     return selectG_AND_OR_XOR(I);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..46cdf813330b4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -97,6 +97,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectG_FNEG(MachineInstr &I) const;
   bool selectG_FABS(MachineInstr &I) const;
   bool selectG_AND_OR_XOR(MachineInstr &I) const;
+  bool selectRotateOrFunnelShiftPattern(MachineInstr &I) const;
   bool selectG_ADD_SUB(MachineInstr &I) const;
   bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
   bool selectG_AMDGPU_MAD_64_32(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e7bf88d2ee5b6..b1b19332d870c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2041,13 +2041,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .clampScalar(0, S32, S64)
     .lower();
 
-  getActionDefinitionsBuilder({G_ROTR, G_ROTL})
-    .scalarize(0)
-    .lower();
+  getActionDefinitionsBuilder({G_ROTR, G_ROTL}).scalarize(0).lower();
 
   // TODO: Only Try to form v2s16 with legal packed instructions.
   getActionDefinitionsBuilder(G_FSHR)
-    .legalFor({{S32, S32}})
     .lowerFor({{V2S16, V2S16}})
     .clampMaxNumElementsStrict(0, S16, 2)
     .scalarize(0)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index b54cccead9781..a280b84a4667b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4089,6 +4089,12 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_AMDGPU_SMED3:
   case AMDGPU::G_AMDGPU_FMED3:
     return getDefaultMappingVOP(MI);
+  case AMDGPU::G_ROTR:
+  case AMDGPU::G_ROTL: {
+    if (isSALUMapping(MI))
+      return getDefaultMappingSOP(MI);
+    return getDefaultMappingVOP(MI);
+  }
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {
     if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index fc81e16d68e98..3e65697c07450 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -1768,102 +1768,102 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 ...
[truncated]

``````````
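
As a companion to the rotate sketch above, here is a hand-written illustration (again not part of the patch or its tests) of the funnel-shift form the matcher handles, including the legalizer's `(src << 1)` pre-shift on the left-shifted operand:

```llvm
; fshr(a, b, amt) after expansion:
;   ((a << 1) << (~amt & 31)) | (b >> (amt & 31))
define i32 @fshr_i32_expanded(i32 %a, i32 %b, i32 %amt) {
  %pre     = shl i32 %a, 1             ; legalizer's (src << 1) pre-shift
  %not.amt = xor i32 %amt, -1
  %shl.amt = and i32 %not.amt, 31      ; left-shift amount = (~amt) & 31
  %hi      = shl i32 %pre, %shl.amt
  %srl.amt = and i32 %amt, 31          ; right-shift amount
  %lo      = lshr i32 %b, %srl.amt
  %fsh     = or i32 %hi, %lo           ; selected as: v_alignbit_b32 dst, %a, %b, %amt
  ret i32 %fsh
}
```

Here the matcher strips the `shl ..., 1` pre-shift, takes `%a` and `%b` as the two `v_alignbit_b32` sources, and uses the original `%amt` as the shift amount.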

</details>


https://github.com/llvm/llvm-project/pull/149817

