[llvm] [AMDGPU] Select scale_offset with SMEM instructions (PR #150078)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 22 11:26:58 PDT 2025


https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/150078

From 1f0f61fef3518f186a9eeafb5b0b7afe61dc6c7e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 22 Jul 2025 11:20:34 -0700
Subject: [PATCH] [AMDGPU] Select scale_offset with SMEM instructions
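
On targets with the scale_offset feature (gfx1250 in the new tests), SMEM
instructions can scale the register soffset by the memory access size when the
SCAL bit is set in the cache policy operand. Teach both the SelectionDAG and
GlobalISel selectors to recognize an soffset that is a shl/mul of an index by
the load size, strip the scaling operation, and emit the S_LOAD with
AMDGPU::CPol::SCAL. Since the SMEM soffset is unsigned, only zero-extended (or
provably non-negative) 32-bit indices are folded.

A minimal example, taken from the added scale-offset-smem.ll test, where the
index is a zero-extended i32 and the element size matches the load size:

  %idxprom = zext i32 %idx to i64
  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
  %ret = load float, ptr addrspace(4) %arrayidx, align 4

is now expected to select the scaled form instead of a separate shift and add:

  s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset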

---
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 143 +++++--
 llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h   |  28 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      | 162 +++++++-
 .../Target/AMDGPU/AMDGPUInstructionSelector.h |  17 +-
 llvm/lib/Target/AMDGPU/SMInstructions.td      |  22 +-
 llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll | 372 ++++++++++++++++++
 6 files changed, 671 insertions(+), 73 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..3412bb5acf28c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1872,6 +1872,23 @@ static SDValue matchZExtFromI32(SDValue Op) {
   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
 }
 
+// If \p Op is already an i32, return it.
+// Otherwise, if this matches *_extend i32:x, return x.
+static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
+                                    const SelectionDAG *DAG) {
+  if (Op.getValueType() == MVT::i32)
+    return Op;
+
+  if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
+      Op.getOpcode() != ISD::ANY_EXTEND &&
+      !(DAG->SignBitIsZero(Op) &&
+        Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
+    return SDValue();
+
+  SDValue ExtSrc = Op.getOperand(0);
+  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
+}
+
 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                            SDValue Addr,
@@ -2159,17 +2176,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
   return true;
 }
 
+// Given \p Offset and load node \p N, check if \p Offset is a multiple of
+// the load byte size. If it is, update \p Offset to the pre-scaled value and
+// return true.
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
+                                           bool IsSigned) const {
+  bool ScaleOffset = false;
+  if (!Subtarget->hasScaleOffset() || !Offset)
+    return false;
+
+  unsigned Size =
+      (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+
+  SDValue Off = Offset;
+  if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
+    Off = Ext;
+
+  if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
+      ScaleOffset = C->getZExtValue() == Log2_32(Size);
+  } else if (Offset.getOpcode() == ISD::MUL ||
+             (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
+             Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
+             (Offset.isMachineOpcode() &&
+              Offset.getMachineOpcode() ==
+                  (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
+                            : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
+    if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
+      ScaleOffset = C->getZExtValue() == Size;
+  }
+
+  if (ScaleOffset)
+    Offset = Off.getOperand(0);
+
+  return ScaleOffset;
+}
+
 // Match an immediate (if Offset is not null) or an SGPR (if SOffset is
 // not null) offset. If Imm32Only is true, match only 32-bit immediate
 // offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
                                           SDValue *SOffset, SDValue *Offset,
                                           bool Imm32Only, bool IsBuffer,
-                                          bool HasSOffset,
-                                          int64_t ImmOffset) const {
+                                          bool HasSOffset, int64_t ImmOffset,
+                                          bool *ScaleOffset) const {
   assert((!SOffset || !Offset) &&
          "Cannot match both soffset and offset at the same time!");
 
+  if (ScaleOffset) {
+    assert(N && SOffset);
+
+    *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
+  }
+
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
   if (!C) {
     if (!SOffset)
@@ -2254,24 +2313,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
 // Match a base and an immediate (if Offset is not null) or an SGPR (if
 // SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
 // true, match only 32-bit immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
-                                              SDValue *SOffset, SDValue *Offset,
-                                              bool Imm32Only, bool IsBuffer,
-                                              bool HasSOffset,
-                                              int64_t ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
+                                              SDValue &SBase, SDValue *SOffset,
+                                              SDValue *Offset, bool Imm32Only,
+                                              bool IsBuffer, bool HasSOffset,
+                                              int64_t ImmOffset,
+                                              bool *ScaleOffset) const {
   if (SOffset && Offset) {
     assert(!Imm32Only && !IsBuffer);
     SDValue B;
 
-    if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+    if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
       return false;
 
     int64_t ImmOff = 0;
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
       ImmOff = C->getSExtValue();
 
-    return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
-                                ImmOff);
+    return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
+                                true, ImmOff, ScaleOffset);
   }
 
   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2291,23 +2351,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
   if (!N0 || !N1)
     return false;
 
-  if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
-                       ImmOffset)) {
+  if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+                       ImmOffset, ScaleOffset)) {
     SBase = N0;
     return true;
   }
-  if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
-                       ImmOffset)) {
+  if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+                       ImmOffset, ScaleOffset)) {
     SBase = N1;
     return true;
   }
   return false;
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
                                     SDValue *SOffset, SDValue *Offset,
-                                    bool Imm32Only) const {
-  if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+                                    bool Imm32Only, bool *ScaleOffset) const {
+  if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
+                           /* IsBuffer */ false, /* HasSOffset */ false,
+                           /* ImmOffset */ 0, ScaleOffset)) {
     SBase = Expand32BitAddress(SBase);
     return true;
   }
@@ -2323,36 +2385,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
 
 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
                                        SDValue &Offset) const {
-  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
+  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+                    &Offset);
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
                                          SDValue &Offset) const {
   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-  return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
-                    /* Imm32Only */ true);
+  return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+                    &Offset, /* Imm32Only */ true);
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
-                                        SDValue &SOffset) const {
-  return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
+                                        SDValue &SOffset, SDValue &CPol) const {
+  bool ScaleOffset;
+  if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
+                  /* Imm32Only */ false, &ScaleOffset))
+    return false;
+
+  CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+                                   SDLoc(N), MVT::i32);
+  return true;
 }
 
-bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
-                                           SDValue &SOffset,
-                                           SDValue &Offset) const {
-  return SelectSMRD(Addr, SBase, &SOffset, &Offset);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
+                                           SDValue &SBase, SDValue &SOffset,
+                                           SDValue &Offset,
+                                           SDValue &CPol) const {
+  bool ScaleOffset;
+  if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
+    return false;
+
+  CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+                                   SDLoc(N), MVT::i32);
+  return true;
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
-  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                           /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
                                                SDValue &Offset) const {
   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
-  return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+  return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
                           /* Imm32Only */ true, /* IsBuffer */ true);
 }
 
@@ -2361,9 +2438,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
   // Match the (soffset + offset) pair as a 32-bit register base and
   // an immediate offset.
   return N.getValueType() == MVT::i32 &&
-         SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
-                              &Offset, /* Imm32Only */ false,
-                              /* IsBuffer */ true);
+         SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
+                              /* SOffset*/ nullptr, &Offset,
+                              /* Imm32Only */ false, /* IsBuffer */ true);
 }
 
 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..f7c7b3e144758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -176,22 +176,28 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
                            SDValue &SAddr, SDValue &Offset) const;
 
-  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
+  bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
                         SDValue *Offset, bool Imm32Only = false,
                         bool IsBuffer = false, bool HasSOffset = false,
-                        int64_t ImmOffset = 0) const;
+                        int64_t ImmOffset = 0,
+                        bool *ScaleOffset = nullptr) const;
   SDValue Expand32BitAddress(SDValue Addr) const;
-  bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                            SDValue *Offset, bool Imm32Only = false,
-                            bool IsBuffer = false, bool HasSOffset = false,
-                            int64_t ImmOffset = 0) const;
-  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
-                  SDValue *Offset, bool Imm32Only = false) const;
+  bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase,
+                            SDValue *SOffset, SDValue *Offset,
+                            bool Imm32Only = false, bool IsBuffer = false,
+                            bool HasSOffset = false, int64_t ImmOffset = 0,
+                            bool *ScaleOffset = nullptr) const;
+  bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset,
+                  SDValue *Offset, bool Imm32Only = false,
+                  bool *ScaleOffset = nullptr) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
   bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
-  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
-  bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
-                         SDValue &Offset) const;
+  bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const;
+  bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset,
+                      SDValue &CPol) const;
+  bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase,
+                         SDValue &SOffset, SDValue &Offset,
+                         SDValue &CPol) const;
   bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
   bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..d2e718c1272f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
 }
 
 /// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
   Register ZExtSrc;
-  if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
-    return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+  if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
+    return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
 
   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
-  const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
     return Register();
 
   assert(Def->getNumOperands() == 3 &&
-         MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
-  if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+         MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+  if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
     return Def->getOperand(1).getReg();
   }
 
   return Register();
 }
 
+/// Match a sign extend from a 32-bit value to 64-bits.
+Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
+  Register SExtSrc;
+  if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
+    return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
+
+  // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (G_ASHR (s32 %x, 31))
+  const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+    return Register();
+
+  assert(Def->getNumOperands() == 3 &&
+         MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+  if (mi_match(Def->getOperand(2).getReg(), *MRI,
+               m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
+                       m_SpecificICst(31))))
+    return Def->getOperand(1).getReg();
+
+  if (VT->signBitIsZero(Reg))
+    return matchZeroExtendFromS32(Reg);
+
+  return Register();
+}
+
+/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
+  return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+                                              : matchZeroExtendFromS32(Reg);
+}
+
+/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
+  return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+                                              : matchSignExtendFromS32(Reg);
+}
+
+Register
+AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
+                                                   bool IsSigned) const {
+  if (IsSigned)
+    return matchSignExtendFromS32OrS32(Reg);
+
+  return matchZeroExtendFromS32OrS32(Reg);
+}
+
 Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
   Register AnyExtSrc;
   if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
@@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
           getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
       if (isSGPR(SAddr)) {
         Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
-        if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+        if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
           Addr = SAddr;
           VOffset = Off;
         }
@@ -5223,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
       getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
   unsigned Key = 0;
 
-  Register S32 = matchZeroExtendFromS32(*MRI, Src);
+  Register S32 = matchZeroExtendFromS32(Src);
   if (!S32)
     S32 = matchAnyExtendFromS32(Src);
 
@@ -5296,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
   }};
 }
 
+// Given \p Offset and the load specified by the \p Root operand, check if
+// \p Offset is a multiple of the load byte size. If it is, update \p Offset
+// to the pre-scaled value and return true.
+bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
+                                                  Register &Offset,
+                                                  bool IsSigned) const {
+  if (!Subtarget->hasScaleOffset())
+    return false;
+
+  const MachineInstr &MI = *Root.getParent();
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+
+  if (!MMO->getSize().hasValue())
+    return false;
+
+  uint64_t Size = MMO->getSize().getValue();
+
+  Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
+  if (!OffsetReg)
+    OffsetReg = Offset;
+
+  if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
+    OffsetReg = Def->Reg;
+
+  Register Op0;
+  MachineInstr *Mul;
+  bool ScaleOffset =
+      (isPowerOf2_64(Size) &&
+       mi_match(OffsetReg, *MRI,
+                m_GShl(m_Reg(Op0),
+                       m_any_of(m_SpecificICst(Log2_64(Size)),
+                                m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
+      mi_match(OffsetReg, *MRI,
+               m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
+                                           m_Copy(m_SpecificICst(Size))))) ||
+      mi_match(
+          OffsetReg, *MRI,
+          m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
+                  m_Reg(Op0), m_SpecificICst(Size))) ||
+      // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
+      (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
+       (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
+                                      : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
+        (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
+         VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
+       mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
+       mi_match(Mul->getOperand(3).getReg(), *MRI,
+                m_GTrunc(m_any_of(m_SpecificICst(Size),
+                                  m_Copy(m_SpecificICst(Size))))) &&
+       mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
+
+  if (ScaleOffset)
+    Offset = Op0;
+
+  return ScaleOffset;
+}
+
 bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
                                                  Register &Base,
                                                  Register *SOffset,
-                                                 int64_t *Offset) const {
+                                                 int64_t *Offset,
+                                                 bool *ScaleOffset) const {
   MachineInstr *MI = Root.getParent();
   MachineBasicBlock *MBB = MI->getParent();
 
@@ -5314,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
   const GEPInfo &GEPI = AddrInfo[0];
   std::optional<int64_t> EncodedImm;
 
+  if (ScaleOffset)
+    *ScaleOffset = false;
+
   if (SOffset && Offset) {
     EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
                                               /*HasSOffset=*/true);
@@ -5321,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
         AddrInfo.size() > 1) {
       const GEPInfo &GEPI2 = AddrInfo[1];
       if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
-        if (Register OffsetReg =
-                matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+        Register OffsetReg = GEPI2.SgprParts[1];
+        if (ScaleOffset)
+          *ScaleOffset =
+              selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+        OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+        if (OffsetReg) {
           Base = GEPI2.SgprParts[0];
           *SOffset = OffsetReg;
           *Offset = *EncodedImm;
@@ -5367,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
   }
 
   if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
-    if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+    Register OffsetReg = GEPI.SgprParts[1];
+    if (ScaleOffset)
+      *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+    OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+    if (OffsetReg) {
       Base = GEPI.SgprParts[0];
       *SOffset = OffsetReg;
       return true;
@@ -5381,7 +5499,8 @@ InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
   Register Base;
   int64_t Offset;
-  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
+  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
+                        /* ScaleOffset */ nullptr))
     return std::nullopt;
 
   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
@@ -5412,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
   Register Base, SOffset;
-  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
+  bool ScaleOffset;
+  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
+                        &ScaleOffset))
     return std::nullopt;
 
+  unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
-           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+           [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
 }
 
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
   Register Base, SOffset;
   int64_t Offset;
-  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+  bool ScaleOffset;
+  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
     return std::nullopt;
 
+  unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
   return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
            [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
-           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+           [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
 }
 
 std::pair<Register, int>
@@ -5565,7 +5691,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
 
       // It's possible voffset is an SGPR here, but the copy to VGPR will be
       // inserted later.
-      if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+      if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
         return {{[=](MachineInstrBuilder &MIB) { // saddr
                    MIB.addReg(SAddr);
                  },
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..e58fbb48ffb20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -232,8 +232,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   InstructionSelector::ComplexRendererFns
   selectVINTERPModsHi(MachineOperand &Root) const;
 
+  bool selectScaleOffset(MachineOperand &Root, Register &Offset,
+                         bool IsSigned) const;
   bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
-                        int64_t *Offset) const;
+                        int64_t *Offset, bool *ScaleOffset) const;
   InstructionSelector::ComplexRendererFns
   selectSmrdImm(MachineOperand &Root) const;
   InstructionSelector::ComplexRendererFns
@@ -421,6 +423,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   // shift amount operand's `ShAmtBits` bits is unneeded.
   bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
 
+  /// Match a zero extend from a 32-bit value to 64-bits.
+  Register matchZeroExtendFromS32(Register Reg) const;
+  /// Match a sign extend from a 32-bit value to 64-bits.
+  Register matchSignExtendFromS32(Register Reg) const;
+  /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+  /// is 32-bit.
+  Register matchZeroExtendFromS32OrS32(Register Reg) const;
+  /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+  /// is 32-bit.
+  Register matchSignExtendFromS32OrS32(Register Reg) const;
+  /// Match either a sign or zero extend, depending on \p IsSigned, from a
+  /// 32-bit value to 64 bits, or \p Reg itself if it is 32-bit.
+  Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const;
   /// Match an any extend from a 32-bit value to 64-bit.
   Register matchAnyExtendFromS32(Register Reg) const;
 
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 0850c41c933de..38cc51b8ab32b 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -864,8 +864,10 @@ def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
 
 def SMRDImm         : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
 def SMRDImm32       : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
-def SMRDSgpr        : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
-def SMRDSgprImm     : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
+let WantsRoot = true in {
+  def SMRDSgpr        : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>;
+  def SMRDSgprImm     : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>;
+}
 def SMRDBufferImm   : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
 def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
 def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
@@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
     let SubtargetPredicate = isNotGFX9Plus;
   }
   def : GCNPat <
-    (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+    (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> {
     let SubtargetPredicate = isGFX9Plus;
   }
 
   // 4. SGPR+IMM offset
   def : GCNPat <
-    (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
-    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+    (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+    (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> {
     let SubtargetPredicate = isGFX9Plus;
   }
 
@@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
 
    // 2. SGPR offset
    def : GCNPat <
-     (node (SMRDSgpr i64:$sbase, i32:$soffset)),
-     (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+     (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+     (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{
        let SubtargetPredicate = isGFX12Plus;
    }
 
    // 3. SGPR+IMM offset
    def : GCNPat <
-     (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
-     (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+     (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+     (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{
        let SubtargetPredicate = isGFX12Plus;
    }
 
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
new file mode 100644
index 0000000000000..b5bb68e1eaa89
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b32_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+; 'i32 %idx' is a signed index while the SMRD soffset is unsigned, so scale_offset is not selected.
+
+define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idx32:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_ashr_i32 s3, s2, 31
+; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT:    s_wait_kmcnt 0x0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idx32:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_ashr_i32 s3, s2, 31
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
+; GISEL-NEXT:    s_add_co_u32 s0, s0, s2
+; GISEL-NEXT:    s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    ; return to shader part epilog
+entry:
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idxprom_wrong_stride:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_mov_b32 s3, 0
+; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
+; SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT:    s_wait_kmcnt 0x0
+; SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; SDAG-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idxprom_wrong_stride:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_mov_b32 s3, 0
+; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
+; GISEL-NEXT:    s_add_co_u32 s0, s0, s2
+; GISEL-NEXT:    s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT:    s_wait_kmcnt 0x0
+; GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GISEL-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b16_idxprom_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+  %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+  %ret.i32 = zext i16 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b64_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b96_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b128_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b256_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b512_idxprom:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <16 x float> %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd
+  %ret = load float, ptr addrspace(4) %arrayidx, align 4
+  ret float %ret
+}
+
+; Note: this is a byte load; there is nothing to scale.
+
+define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd
+  %ld = load i8, ptr addrspace(4) %arrayidx
+  %ret.i32 = zext i8 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom
+  %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+  %ret.i32 = zext i16 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %idxadd = add i64 %idxprom, 16
+  %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+  %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+  %ret.i32 = zext i16 %ld to i32
+  %ret = bitcast i32 %ret.i32 to float
+  ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b64_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b96_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b128_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b256_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b512_idxprom_range:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT:    v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT:    v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT:    v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT:    v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT:    v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT:    ; return to shader part epilog
+entry:
+  %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+  %idxprom = zext i32 %idx to i64
+  %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+  %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+  ret <16 x float> %ret
+}
+
+!0 = !{i32 0, i32 1024}


