[llvm] [AMDGPU] Select scale_offset with SMEM instructions (PR #150078)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 22 11:26:58 PDT 2025
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/150078
>From 1f0f61fef3518f186a9eeafb5b0b7afe61dc6c7e Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 22 Jul 2025 11:20:34 -0700
Subject: [PATCH] [AMDGPU] Select scale_offset with SMEM instructions
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 143 +++++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 28 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 162 +++++++-
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 17 +-
llvm/lib/Target/AMDGPU/SMInstructions.td | 22 +-
llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll | 372 ++++++++++++++++++
6 files changed, 671 insertions(+), 73 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 00c7f0eb6e9f1..3412bb5acf28c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1872,6 +1872,23 @@ static SDValue matchZExtFromI32(SDValue Op) {
return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
}
+// If Op is the appropriate sign/zero/any-extend of an i32 value, return that
+// i32 source. If Op itself is already i32, return Op.
+static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
+ const SelectionDAG *DAG) {
+ if (Op.getValueType() == MVT::i32)
+ return Op;
+
+ if (Op.getOpcode() != (IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND) &&
+ Op.getOpcode() != ISD::ANY_EXTEND &&
+ !(DAG->SignBitIsZero(Op) &&
+ Op.getOpcode() == (IsSigned ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND)))
+ return SDValue();
+
+ SDValue ExtSrc = Op.getOperand(0);
+ return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
+}
+
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
SDValue Addr,
@@ -2159,17 +2176,59 @@ bool AMDGPUDAGToDAGISel::isSOffsetLegalWithImmOffset(SDValue *SOffset,
return true;
}
+// Given \p Offset and load node \p N, check if \p Offset is a multiple of
+// the load byte size. If it is, update \p Offset to the pre-scaled value and
+// return true.
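+// For example, for a dword load with Offset = (shl $idx, 2) or
+// Offset = (mul $idx, 4) this returns true and rewrites Offset to $idx,
+// leaving the scaling to the SCALE_OFFSET bit of the instruction.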
+bool AMDGPUDAGToDAGISel::SelectScaleOffset(SDNode *N, SDValue &Offset,
+ bool IsSigned) const {
+ bool ScaleOffset = false;
+ if (!Subtarget->hasScaleOffset() || !Offset)
+ return false;
+
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+
+ SDValue Off = Offset;
+ if (SDValue Ext = matchExtFromI32orI32(Offset, IsSigned, CurDAG))
+ Off = Ext;
+
+ if (isPowerOf2_32(Size) && Off.getOpcode() == ISD::SHL) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Off.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Log2_32(Size);
+ } else if (Offset.getOpcode() == ISD::MUL ||
+ (IsSigned && Offset.getOpcode() == AMDGPUISD::MUL_I24) ||
+ Offset.getOpcode() == AMDGPUISD::MUL_U24 ||
+ (Offset.isMachineOpcode() &&
+ Offset.getMachineOpcode() ==
+ (IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO
+ : AMDGPU::S_MUL_U64_U32_PSEUDO))) {
+ if (auto *C = dyn_cast<ConstantSDNode>(Offset.getOperand(1)))
+ ScaleOffset = C->getZExtValue() == Size;
+ }
+
+ if (ScaleOffset)
+ Offset = Off.getOperand(0);
+
+ return ScaleOffset;
+}
+
// Match an immediate (if Offset is not null) or an SGPR (if SOffset is
// not null) offset. If Imm32Only is true, match only 32-bit immediate
// offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode,
SDValue *SOffset, SDValue *Offset,
bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+ bool HasSOffset, int64_t ImmOffset,
+ bool *ScaleOffset) const {
assert((!SOffset || !Offset) &&
"Cannot match both soffset and offset at the same time!");
+ if (ScaleOffset) {
+ assert(N && SOffset);
+
+ *ScaleOffset = SelectScaleOffset(N, ByteOffsetNode, false /* IsSigned */);
+ }
+
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
if (!SOffset)
@@ -2254,24 +2313,25 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// Match a base and an immediate (if Offset is not null) or an SGPR (if
// SOffset is not null) or an immediate+SGPR offset. If Imm32Only is
// true, match only 32-bit immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
- SDValue *SOffset, SDValue *Offset,
- bool Imm32Only, bool IsBuffer,
- bool HasSOffset,
- int64_t ImmOffset) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only,
+ bool IsBuffer, bool HasSOffset,
+ int64_t ImmOffset,
+ bool *ScaleOffset) const {
if (SOffset && Offset) {
assert(!Imm32Only && !IsBuffer);
SDValue B;
- if (!SelectSMRDBaseOffset(Addr, B, nullptr, Offset, false, false, true))
+ if (!SelectSMRDBaseOffset(N, Addr, B, nullptr, Offset, false, false, true))
return false;
int64_t ImmOff = 0;
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(*Offset))
ImmOff = C->getSExtValue();
- return SelectSMRDBaseOffset(B, SBase, SOffset, nullptr, false, false, true,
- ImmOff);
+ return SelectSMRDBaseOffset(N, B, SBase, SOffset, nullptr, false, false,
+ true, ImmOff, ScaleOffset);
}
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
@@ -2291,23 +2351,25 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
if (!N0 || !N1)
return false;
- if (SelectSMRDOffset(N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N1, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N0;
return true;
}
- if (SelectSMRDOffset(N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
- ImmOffset)) {
+ if (SelectSMRDOffset(N, N0, SOffset, Offset, Imm32Only, IsBuffer, HasSOffset,
+ ImmOffset, ScaleOffset)) {
SBase = N1;
return true;
}
return false;
}
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase,
SDValue *SOffset, SDValue *Offset,
- bool Imm32Only) const {
- if (SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only)) {
+ bool Imm32Only, bool *ScaleOffset) const {
+ if (SelectSMRDBaseOffset(N, Addr, SBase, SOffset, Offset, Imm32Only,
+ /* IsBuffer */ false, /* HasSOffset */ false,
+ /* ImmOffset */ 0, ScaleOffset)) {
SBase = Expand32BitAddress(SBase);
return true;
}
@@ -2323,36 +2385,51 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
- /* Imm32Only */ true);
+ return SelectSMRD(/* N */ nullptr, Addr, SBase, /* SOffset */ nullptr,
+ &Offset, /* Imm32Only */ true);
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
- SDValue &SOffset) const {
- return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, /* Offset */ nullptr,
+ /* Imm32Only */ false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
-bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
- SDValue &SOffset,
- SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, &SOffset, &Offset);
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDNode *N, SDValue Addr,
+ SDValue &SBase, SDValue &SOffset,
+ SDValue &Offset,
+ SDValue &CPol) const {
+ bool ScaleOffset;
+ if (!SelectSMRD(N, Addr, SBase, &SOffset, &Offset, false, &ScaleOffset))
+ return false;
+
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(N), MVT::i32);
+ return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue N, SDValue &Offset) const {
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue N,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRDOffset(N, /* SOffset */ nullptr, &Offset,
+ return SelectSMRDOffset(/* N */ nullptr, N, /* SOffset */ nullptr, &Offset,
/* Imm32Only */ true, /* IsBuffer */ true);
}
@@ -2361,9 +2438,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
// Match the (soffset + offset) pair as a 32-bit register base and
// an immediate offset.
return N.getValueType() == MVT::i32 &&
- SelectSMRDBaseOffset(N, /* SBase */ SOffset, /* SOffset*/ nullptr,
- &Offset, /* Imm32Only */ false,
- /* IsBuffer */ true);
+ SelectSMRDBaseOffset(/* N */ nullptr, N, /* SBase */ SOffset,
+ /* SOffset*/ nullptr, &Offset,
+ /* Imm32Only */ false, /* IsBuffer */ true);
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index acbab3d9e2d81..f7c7b3e144758 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -176,22 +176,28 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue *SOffset,
+ bool SelectSMRDOffset(SDNode *N, SDValue ByteOffsetNode, SDValue *SOffset,
SDValue *Offset, bool Imm32Only = false,
bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
+ int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false,
- bool IsBuffer = false, bool HasSOffset = false,
- int64_t ImmOffset = 0) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
- SDValue *Offset, bool Imm32Only = false) const;
+ bool SelectSMRDBaseOffset(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only = false, bool IsBuffer = false,
+ bool HasSOffset = false, int64_t ImmOffset = 0,
+ bool *ScaleOffset = nullptr) const;
+ bool SelectSMRD(SDNode *N, SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false,
+ bool *ScaleOffset = nullptr) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
- bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
- SDValue &Offset) const;
+ bool SelectScaleOffset(SDNode *N, SDValue &Offset, bool IsSigned) const;
+ bool SelectSMRDSgpr(SDNode *N, SDValue Addr, SDValue &SBase, SDValue &SOffset,
+ SDValue &CPol) const;
+ bool SelectSMRDSgprImm(SDNode *N, SDValue Addr, SDValue &SBase,
+ SDValue &SOffset, SDValue &Offset,
+ SDValue &CPol) const;
bool SelectSMRDBufferImm(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue N, SDValue &Offset) const;
bool SelectSMRDBufferSgprImm(SDValue N, SDValue &SOffset,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 8975486caa770..d2e718c1272f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3494,25 +3494,74 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
}
/// Match a zero extend from a 32-bit value to 64-bits.
-static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
+Register AMDGPUInstructionSelector::matchZeroExtendFromS32(Register Reg) const {
Register ZExtSrc;
- if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
- return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
+ if (mi_match(Reg, *MRI, m_GZExt(m_Reg(ZExtSrc))))
+ return MRI->getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
- const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
return Register();
assert(Def->getNumOperands() == 3 &&
- MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
- if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
return Register();
}
+/// Match a sign extend from a 32-bit value to 64-bits.
+Register AMDGPUInstructionSelector::matchSignExtendFromS32(Register Reg) const {
+ Register SExtSrc;
+ if (mi_match(Reg, *MRI, m_GSExt(m_Reg(SExtSrc))))
+ return MRI->getType(SExtSrc) == LLT::scalar(32) ? SExtSrc : Register();
+
+ // Match legalized form %sext = G_MERGE_VALUES (s32 %x), (s32 (G_ASHR %x, 31))
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
+ return Register();
+
+ assert(Def->getNumOperands() == 3 &&
+ MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
+ if (mi_match(Def->getOperand(2).getReg(), *MRI,
+ m_GAShr(m_SpecificReg(Def->getOperand(1).getReg()),
+ m_SpecificICst(31))))
+ return Def->getOperand(1).getReg();
+
+ if (VT->signBitIsZero(Reg))
+ return matchZeroExtendFromS32(Reg);
+
+ return Register();
+}
+
+/// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchZeroExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchZeroExtendFromS32(Reg);
+}
+
+/// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+/// is 32-bit.
+Register
+AMDGPUInstructionSelector::matchSignExtendFromS32OrS32(Register Reg) const {
+ return MRI->getType(Reg) == LLT::scalar(32) ? Reg
+ : matchSignExtendFromS32(Reg);
+}
+
+Register
+AMDGPUInstructionSelector::matchExtendFromS32OrS32(Register Reg,
+ bool IsSigned) const {
+ if (IsSigned)
+ return matchSignExtendFromS32OrS32(Reg);
+
+ return matchZeroExtendFromS32OrS32(Reg);
+}
+
Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const {
Register AnyExtSrc;
if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc))))
@@ -3581,7 +3630,7 @@ bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
if (isSGPR(SAddr)) {
Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
- if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ if (Register Off = matchZeroExtendFromS32(PtrBaseOffset)) {
Addr = SAddr;
VOffset = Off;
}
@@ -5223,7 +5272,7 @@ AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const {
getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
unsigned Key = 0;
- Register S32 = matchZeroExtendFromS32(*MRI, Src);
+ Register S32 = matchZeroExtendFromS32(Src);
if (!S32)
S32 = matchAnyExtendFromS32(Src);
@@ -5296,10 +5345,68 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
}};
}
+// Given \p Offset and the load specified by the \p Root operand, check if
+// \p Offset is a multiple of the load byte size. If it is, update \p Offset
+// to the pre-scaled value and return true.
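+// For example, for a dword load with Offset = (G_SHL $idx, 2) or
+// Offset = (G_MUL $idx, 4) this returns true and rewrites Offset to $idx,
+// leaving the scaling to the SCALE_OFFSET bit of the instruction.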
+bool AMDGPUInstructionSelector::selectScaleOffset(MachineOperand &Root,
+ Register &Offset,
+ bool IsSigned) const {
+ if (!Subtarget->hasScaleOffset())
+ return false;
+
+ const MachineInstr &MI = *Root.getParent();
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ if (!MMO->getSize().hasValue())
+ return false;
+
+ uint64_t Size = MMO->getSize().getValue();
+
+ Register OffsetReg = matchExtendFromS32OrS32(Offset, IsSigned);
+ if (!OffsetReg)
+ OffsetReg = Offset;
+
+ if (auto Def = getDefSrcRegIgnoringCopies(OffsetReg, *MRI))
+ OffsetReg = Def->Reg;
+
+ Register Op0;
+ MachineInstr *Mul;
+ bool ScaleOffset =
+ (isPowerOf2_64(Size) &&
+ mi_match(OffsetReg, *MRI,
+ m_GShl(m_Reg(Op0),
+ m_any_of(m_SpecificICst(Log2_64(Size)),
+ m_Copy(m_SpecificICst(Log2_64(Size))))))) ||
+ mi_match(OffsetReg, *MRI,
+ m_GMul(m_Reg(Op0), m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) ||
+ mi_match(
+ OffsetReg, *MRI,
+ m_BinOp(IsSigned ? AMDGPU::S_MUL_I64_I32_PSEUDO : AMDGPU::S_MUL_U64,
+ m_Reg(Op0), m_SpecificICst(Size))) ||
+ // Match G_AMDGPU_MAD_U64_U32 offset, c, 0
+ (mi_match(OffsetReg, *MRI, m_MInstr(Mul)) &&
+ (Mul->getOpcode() == (IsSigned ? AMDGPU::G_AMDGPU_MAD_I64_I32
+ : AMDGPU::G_AMDGPU_MAD_U64_U32) ||
+ (IsSigned && Mul->getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32 &&
+ VT->signBitIsZero(Mul->getOperand(2).getReg()))) &&
+ mi_match(Mul->getOperand(4).getReg(), *MRI, m_ZeroInt()) &&
+ mi_match(Mul->getOperand(3).getReg(), *MRI,
+ m_GTrunc(m_any_of(m_SpecificICst(Size),
+ m_Copy(m_SpecificICst(Size))))) &&
+ mi_match(Mul->getOperand(2).getReg(), *MRI, m_Reg(Op0)));
+
+ if (ScaleOffset)
+ Offset = Op0;
+
+ return ScaleOffset;
+}
+
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
Register &Base,
Register *SOffset,
- int64_t *Offset) const {
+ int64_t *Offset,
+ bool *ScaleOffset) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
@@ -5314,6 +5421,9 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
const GEPInfo &GEPI = AddrInfo[0];
std::optional<int64_t> EncodedImm;
+ if (ScaleOffset)
+ *ScaleOffset = false;
+
if (SOffset && Offset) {
EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, /*IsBuffer=*/false,
/*HasSOffset=*/true);
@@ -5321,8 +5431,12 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
AddrInfo.size() > 1) {
const GEPInfo &GEPI2 = AddrInfo[1];
if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
- if (Register OffsetReg =
- matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+ Register OffsetReg = GEPI2.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset =
+ selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI2.SgprParts[0];
*SOffset = OffsetReg;
*Offset = *EncodedImm;
@@ -5367,7 +5481,11 @@ bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
}
if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
- if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+ Register OffsetReg = GEPI.SgprParts[1];
+ if (ScaleOffset)
+ *ScaleOffset = selectScaleOffset(Root, OffsetReg, false /* IsSigned */);
+ OffsetReg = matchZeroExtendFromS32OrS32(OffsetReg);
+ if (OffsetReg) {
Base = GEPI.SgprParts[0];
*SOffset = OffsetReg;
return true;
@@ -5381,7 +5499,8 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
Register Base;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
+ if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset,
+ /* ScaleOffset */ nullptr))
return std::nullopt;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
@@ -5412,23 +5531,30 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
Register Base, SOffset;
- if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr,
+ &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
Register Base, SOffset;
int64_t Offset;
- if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+ bool ScaleOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, &Offset, &ScaleOffset))
return std::nullopt;
+ unsigned CPol = ScaleOffset ? AMDGPU::CPol::SCAL : 0;
return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPol); }}};
}
std::pair<Register, int>
@@ -5565,7 +5691,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
- if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
+ if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 34bdf0a6d4ab2..e58fbb48ffb20 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -232,8 +232,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ bool selectScaleOffset(MachineOperand &Root, Register &Offset,
+ bool IsSigned) const;
bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
- int64_t *Offset) const;
+ int64_t *Offset, bool *ScaleOffset) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -421,6 +423,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
// shift amount operand's `ShAmtBits` bits is unneeded.
bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const;
+ /// Match a zero extend from a 32-bit value to 64-bits.
+ Register matchZeroExtendFromS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits.
+ Register matchSignExtendFromS32(Register Reg) const;
+ /// Match a zero extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchZeroExtendFromS32OrS32(Register Reg) const;
+ /// Match a sign extend from a 32-bit value to 64-bits, or \p Reg itself if it
+ /// is 32-bit.
+ Register matchSignExtendFromS32OrS32(Register Reg) const;
+ /// Match either a sign or a zero extend from a 32-bit value to 64-bits,
+ /// depending on \p IsSigned, or \p Reg itself if it is 32-bit.
+ Register matchExtendFromS32OrS32(Register Reg, bool IsSigned) const;
/// Match an any extend from a 32-bit value to 64-bit.
Register matchAnyExtendFromS32(Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 0850c41c933de..38cc51b8ab32b 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -864,8 +864,10 @@ def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
-def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
-def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
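+// WantsRoot passes the root load node to the selection routines so they can
+// inspect the memory size and fold a matching scale into the cpol operand.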
+let WantsRoot = true in {
+ def SMRDSgpr : ComplexPattern<iPTR, 3, "SelectSMRDSgpr", [], [], -3>;
+ def SMRDSgprImm : ComplexPattern<iPTR, 4, "SelectSMRDSgprImm", [], []>;
+}
def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
@@ -906,15 +908,15 @@ multiclass SMRD_Patterns <string Instr, ValueType vt, PatFrag frag,
let SubtargetPredicate = isNotGFX9Plus;
}
def : GCNPat <
- (frag (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, 0))> {
+ (frag (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, 0, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
// 4. SGPR+IMM offset
def : GCNPat <
- (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, 0))> {
+ (frag (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM"#suffix) $sbase, $soffset, $offset, $cpol))> {
let SubtargetPredicate = isGFX9Plus;
}
@@ -989,15 +991,15 @@ multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, Val
// 2. SGPR offset
def : GCNPat <
- (node (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+ (node (SMRDSgpr i64:$sbase, i32:$soffset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
// 3. SGPR+IMM offset
def : GCNPat <
- (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+ (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset, CPol:$cpol)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, $cpol))>{
let SubtargetPredicate = isGFX12Plus;
}
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
new file mode 100644
index 0000000000000..b5bb68e1eaa89
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -0,0 +1,372 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
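+; Check that an SGPR offset equal to the index multiplied by the load size
+; selects the scale_offset modifier on gfx1250 SMEM loads.
+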
+define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; 'i32 %idx' is a signed index while the SMRD soffset is unsigned, thus scale_offset is not selected.
+
+define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idx32:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_ashr_i32 s3, s2, 31
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idx32:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_ashr_i32 s3, s2, 31
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 2
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i32 %idx
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; SDAG-LABEL: s_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s3, 0
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; SDAG-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; SDAG-NEXT: s_load_b32 s0, s[0:1], 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_mov_b32_e32 v0, s0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: s_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s3, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GISEL-NEXT: s_lshl_b64 s[2:3], s[2:3], 3
+; GISEL-NEXT: s_add_co_u32 s0, s0, s2
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, s3
+; GISEL-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b256_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
+; GCN-LABEL: s_load_b512_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <16 x float> %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(4) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(4) %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, so there is nothing to scale.
+
+define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u8 s0, s[0:1], s2 offset:0x10
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(4) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(4) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(4) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b96 s[0:2], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b128 s[0:3], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b256_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b256 s[0:7], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <8 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <8 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <8 x float> %ret
+}
+
+define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
+; GCN-LABEL: s_load_b512_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b512 s[0:15], s[0:1], s2 offset:0x0 scale_offset
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GCN-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GCN-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GCN-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GCN-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
+; GCN-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; GCN-NEXT: v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s13
+; GCN-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_mov_b32 v15, s15
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(4) %p, align 4, !range !0
+ %idxprom = zext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <16 x float>, ptr addrspace(4) %p, i64 %idxprom
+ %ret = load <16 x float>, ptr addrspace(4) %arrayidx, align 4
+ ret <16 x float> %ret
+}
+
+!0 = !{i32 0, i32 1024}