[llvm] [AMDGPU] Select scale_offset for global instructions on gfx1250 (PR #150107)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 22 13:50:51 PDT 2025
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/150107
>From 7cfd54bf328cad8e8f2a85c2386131aada07e2b0 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Tue, 22 Jul 2025 13:42:05 -0700
Subject: [PATCH] [AMDGPU] Select scale_offset for global instructions on
gfx1250
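
As the new tests show, the scale_offset modifier lets the hardware multiply the
32-bit VGPR offset by the memory access size, so an i32 index that is sign- or
zero-extended and scaled by the element size no longer needs a separate shift.
A minimal illustration follows; it is copied from the added
llvm/test/CodeGen/AMDGPU/scale-offset-global.ll test and is not additional
patch content. The fold only applies when the stride matches the access size
(the *_wrong_stride tests still emit the 64-bit shift/add):

  define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
  entry:
    %idxprom = sext i32 %idx to i64
    ; index scaled by sizeof(float) == 4 via the GEP, matching the b32 access size
    %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
    %ret = load float, ptr addrspace(1) %arrayidx, align 4
    ret float %ret
  }

  ; expected selection on gfx1250 (per the test checks below):
  ;   global_load_b32 v0, v0, s[0:1] scale_offset
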
---
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 85 ++--
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 3 +-
.../AMDGPU/AMDGPUInstructionSelector.cpp | 63 ++-
.../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 +
.../AMDGPU/branch-relaxation-gfx1250.ll | 7 +-
llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 182 ++++----
llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll | 14 +-
llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll | 436 ++++++++++++++++++
.../CodeGen/AMDGPU/scale-offset-global.ll | 351 ++++++++++++++
10 files changed, 1003 insertions(+), 144 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 3412bb5acf28c..b15c946de16f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1863,15 +1863,6 @@ bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
SIInstrFlags::FlatScratch);
}
-// If this matches zero_extend i32:x, return x
-static SDValue matchZExtFromI32(SDValue Op) {
- if (Op.getOpcode() != ISD::ZERO_EXTEND)
- return SDValue();
-
- SDValue ExtSrc = Op.getOperand(0);
- return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
-}
-
// If this matches *_extend i32:x, return x
// Otherwise if the value is I32 returns x.
static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
@@ -1890,12 +1881,13 @@ static SDValue matchExtFromI32orI32(SDValue Op, bool IsSigned,
}
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
- SDValue Addr,
- SDValue &SAddr,
- SDValue &VOffset,
- SDValue &Offset) const {
+// or (64-bit SGPR base) + (sext vgpr offset) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
+ SDValue &SAddr, SDValue &VOffset,
+ SDValue &Offset, bool &ScaleOffset,
+ bool NeedIOffset) const {
int64_t ImmOffset = 0;
+ ScaleOffset = false;
// Match the immediate offset first, which canonically is moved as low as
// possible.
@@ -1905,7 +1897,8 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
+ if (NeedIOffset &&
+ TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = LHS;
ImmOffset = COffsetVal;
@@ -1915,11 +1908,14 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
- COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ int64_t SplitImmOffset = 0, RemainderOffset = COffsetVal;
+ if (NeedIOffset) {
+ std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+ COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ }
- if (isUInt<32>(RemainderOffset)) {
+ if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
+ : isUInt<32>(RemainderOffset)) {
SDNode *VMov = CurDAG->getMachineNode(
AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
@@ -1946,21 +1942,26 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
// Match the variable offset.
if (Addr.getOpcode() == ISD::ADD) {
LHS = Addr.getOperand(0);
- RHS = Addr.getOperand(1);
if (!LHS->isDivergent()) {
- // add (i64 sgpr), (zero_extend (i32 vgpr))
- if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+ // add (i64 sgpr), (*_extend (i32 vgpr))
+ RHS = Addr.getOperand(1);
+ ScaleOffset = SelectScaleOffset(N, RHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtRHS = matchExtFromI32orI32(
+ RHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = LHS;
- VOffset = ZextRHS;
+ VOffset = ExtRHS;
}
}
+ RHS = Addr.getOperand(1);
if (!SAddr && !RHS->isDivergent()) {
- // add (zero_extend (i32 vgpr)), (i64 sgpr)
- if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+ // add (*_extend (i32 vgpr)), (i64 sgpr)
+ ScaleOffset = SelectScaleOffset(N, LHS, Subtarget->hasSignedGVSOffset());
+ if (SDValue ExtLHS = matchExtFromI32orI32(
+ LHS, Subtarget->hasSignedGVSOffset(), CurDAG)) {
SAddr = RHS;
- VOffset = ZextLHS;
+ VOffset = ExtLHS;
}
}
@@ -1970,6 +1971,27 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
}
+ if (Subtarget->hasScaleOffset() &&
+ (Addr.getOpcode() == (Subtarget->hasSignedGVSOffset()
+ ? AMDGPUISD::MAD_I64_I32
+ : AMDGPUISD::MAD_U64_U32) ||
+ (Addr.getOpcode() == AMDGPUISD::MAD_U64_U32 &&
+ CurDAG->SignBitIsZero(Addr.getOperand(0)))) &&
+ Addr.getOperand(0)->isDivergent() &&
+ isa<ConstantSDNode>(Addr.getOperand(1)) &&
+ !Addr.getOperand(2)->isDivergent()) {
+ // mad_u64_u32 (i32 vgpr), (i32 c), (i64 sgpr)
+ unsigned Size =
+ (unsigned)cast<MemSDNode>(N)->getMemoryVT().getFixedSizeInBits() / 8;
+ ScaleOffset = Addr.getConstantOperandVal(1) == Size;
+ if (ScaleOffset) {
+ SAddr = Addr.getOperand(2);
+ VOffset = Addr.getOperand(0);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
+ return true;
+ }
+ }
+
if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
isa<ConstantSDNode>(Addr))
return false;
@@ -1989,10 +2011,12 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
- if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
- CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
+ CPol = CurDAG->getTargetConstant(ScaleOffset ? AMDGPU::CPol::SCAL : 0,
+ SDLoc(), MVT::i32);
return true;
}
@@ -2000,10 +2024,11 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr,
SDValue &SAddr, SDValue &VOffset,
SDValue &Offset,
SDValue &CPol) const {
- if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset))
+ bool ScaleOffset;
+ if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset, ScaleOffset))
return false;
- unsigned CPolVal = AMDGPU::CPol::GLC;
+ unsigned CPolVal = (ScaleOffset ? AMDGPU::CPol::SCAL : 0) | AMDGPU::CPol::GLC;
CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index f7c7b3e144758..eb23e80943bb7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -162,7 +162,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
bool SelectScratchOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
- SDValue &VOffset, SDValue &Offset) const;
+ SDValue &VOffset, SDValue &Offset, bool &ScaleOffset,
+ bool NeedIOffset = true) const;
bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
SDValue &VOffset, SDValue &Offset,
SDValue &CPol) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index d2e718c1272f8..6b38beecea543 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -5616,7 +5616,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
- unsigned CPolBits) const {
+ unsigned CPolBits,
+ bool NeedIOffset) const {
Register Addr = Root.getReg();
Register PtrBase;
int64_t ConstOffset;
@@ -5627,7 +5628,8 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
if (ConstOffset != 0) {
- if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ if (NeedIOffset &&
+ TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
SIInstrFlags::FlatGlobal)) {
Addr = PtrBase;
ImmOffset = ConstOffset;
@@ -5640,11 +5642,15 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// saddr + large_offset -> saddr +
// (voffset = large_offset & ~MaxOffset) +
// (large_offset & MaxOffset);
- int64_t SplitImmOffset, RemainderOffset;
- std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
- ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
+ int64_t SplitImmOffset = 0, RemainderOffset = ConstOffset;
+ if (NeedIOffset) {
+ std::tie(SplitImmOffset, RemainderOffset) =
+ TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal);
+ }
- if (isUInt<32>(RemainderOffset)) {
+ if (Subtarget->hasSignedGVSOffset() ? isInt<32>(RemainderOffset)
+ : isUInt<32>(RemainderOffset)) {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
Register HighBits =
@@ -5654,12 +5660,22 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
HighBits)
.addImm(RemainderOffset);
+ if (NeedIOffset)
+ return {{
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(PtrBase);
+ }, // saddr
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(HighBits);
+ }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
+ }};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
[=](MachineInstrBuilder &MIB) {
MIB.addReg(HighBits);
}, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); },
}};
}
@@ -5691,18 +5707,33 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
// It's possible voffset is an SGPR here, but the copy to VGPR will be
// inserted later.
- if (Register VOffset = matchZeroExtendFromS32(PtrBaseOffset)) {
+ bool ScaleOffset = selectScaleOffset(Root, PtrBaseOffset,
+ Subtarget->hasSignedGVSOffset());
+ if (Register VOffset = matchExtendFromS32OrS32(
+ PtrBaseOffset, Subtarget->hasSignedGVSOffset())) {
+ if (NeedIOffset)
+ return {{[=](MachineInstrBuilder &MIB) { // saddr
+ MIB.addReg(SAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // voffset
+ MIB.addReg(VOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(ImmOffset);
+ },
+ [=](MachineInstrBuilder &MIB) { // cpol
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
+ }}};
return {{[=](MachineInstrBuilder &MIB) { // saddr
MIB.addReg(SAddr);
},
[=](MachineInstrBuilder &MIB) { // voffset
MIB.addReg(VOffset);
},
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(ImmOffset);
- },
[=](MachineInstrBuilder &MIB) { // cpol
- MIB.addImm(CPolBits);
+ MIB.addImm(CPolBits |
+ (ScaleOffset ? AMDGPU::CPol::SCAL : 0));
}}};
}
}
@@ -5723,10 +5754,16 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root,
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
.addImm(0);
+ if (NeedIOffset)
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
+ }};
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
[=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset
[=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol
}};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index e58fbb48ffb20..5f7f05c52ad21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -256,7 +256,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
selectScratchOffset(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const;
+ selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits,
+ bool NeedIOffset = true) const;
InstructionSelector::ComplexRendererFns
selectGlobalSAddr(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 407d79a30599b..56851571c6c68 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1167,6 +1167,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool hasFlatGVSMode() const { return FlatGVSMode; }
+ // FLAT GLOBAL VOffset is signed
+ bool hasSignedGVSOffset() const { return GFX1250Insts; }
+
bool enableSIScheduler() const {
return EnableSIScheduler;
}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index d103423ae1675..95504052249e0 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -145,12 +145,13 @@ define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN: ; %bb.0: ; %bb
; GCN-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0
-; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
+; GCN-NEXT: global_load_b32 v2, v0, s[0:1] scale_offset scope:SCOPE_SYS
; GCN-NEXT: s_wait_loadcnt 0x0
; GCN-NEXT: s_wait_xcnt 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GCN-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
; GCN-NEXT: s_mov_b32 s0, exec_lo
; GCN-NEXT: v_cmpx_ne_u32_e32 0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index f54fbbaabe9f5..e6018e413a85d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -95,12 +95,24 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
}
define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
-; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s3, s0
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295
%load = load i8, ptr %gep0
%zext = zext i8 %load to i32
@@ -551,12 +563,21 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
; Both 64-bit base and 32-bit offset are scalar
define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%load = load i8, ptr %gep0
@@ -567,12 +588,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase,
; Both 64-bit base and 32-bit offset are scalar, with immediate offset.
define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:-24
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24
@@ -584,12 +614,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr
; Both components uniform, zext forced to LHS of addressing expression
define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3]
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
@@ -602,12 +641,21 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in
; Both components uniform, zext forced to LHS of addressing expression, with immediate offset
define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) {
-; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
-; GFX1250: ; %bb.0:
-; GFX1250-NEXT: v_mov_b32_e32 v0, s4
-; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, s4
+; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, s4
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 0
+; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[0:1] offset:128
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %soffset to i64
%sbase.as.int = ptrtoint ptr %sbase to i64
%add = add i64 %zext.offset, %sbase.as.int
@@ -686,33 +734,13 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
; Cannot push the shift into 32-bits, and cannot match.
define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) {
-; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
@@ -743,8 +771,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, pt
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3]
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
@@ -760,8 +787,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 scale_offset
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{}
@@ -774,33 +800,13 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
; Range is 1 beyond the limit where we can move the shift into 32-bits.
define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) {
-; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
-; GFX1250-SDAG: ; %bb.0:
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
-; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3]
-; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-SDAG-NEXT: ; return to shader part epilog
-;
-; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
-; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
-; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1]
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
-; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1]
-; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-GISEL-NEXT: ; return to shader part epilog
+; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: flat_load_b32 v0, v[0:1]
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: ; return to shader part epilog
%voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{}
%zext.offset = zext i32 %voffset to i64
%gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset
diff --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index 5d35adc8cbe0a..79907fd0c60bc 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -482,17 +482,16 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-SDAG-LABEL: test_v7i16_load_store_kernel:
; GCN-SDAG: ; %bb.0:
; GCN-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GCN-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GCN-SDAG-NEXT: s_wait_xcnt 0x0
; GCN-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GCN-SDAG-NEXT: v_mov_b64_e32 v[8:9], 12
; GCN-SDAG-NEXT: v_mov_b64_e32 v[10:11], 8
; GCN-SDAG-NEXT: v_mov_b64_e32 v[12:13], 0
-; GCN-SDAG-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GCN-SDAG-NEXT: s_wait_kmcnt 0x0
; GCN-SDAG-NEXT: s_clause 0x1
-; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1]
-; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3]
+; GCN-SDAG-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
+; GCN-SDAG-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset
; GCN-SDAG-NEXT: s_wait_loadcnt 0x0
; GCN-SDAG-NEXT: v_pk_add_u16 v3, v3, v7
; GCN-SDAG-NEXT: v_pk_add_u16 v2, v2, v6
@@ -509,21 +508,20 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
; GCN-GISEL-LABEL: test_v7i16_load_store_kernel:
; GCN-GISEL: ; %bb.0:
; GCN-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GCN-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GCN-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GCN-GISEL-NEXT: s_wait_xcnt 0x0
; GCN-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x10
; GCN-GISEL-NEXT: v_mov_b64_e32 v[8:9], 0
; GCN-GISEL-NEXT: v_mov_b64_e32 v[10:11], 2
; GCN-GISEL-NEXT: v_mov_b64_e32 v[12:13], 4
-; GCN-GISEL-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GCN-GISEL-NEXT: v_mov_b64_e32 v[14:15], 6
; GCN-GISEL-NEXT: v_mov_b64_e32 v[16:17], 8
; GCN-GISEL-NEXT: v_mov_b64_e32 v[18:19], 10
; GCN-GISEL-NEXT: v_mov_b64_e32 v[20:21], 12
; GCN-GISEL-NEXT: s_wait_kmcnt 0x0
; GCN-GISEL-NEXT: s_clause 0x1
-; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1]
-; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3]
+; GCN-GISEL-NEXT: global_load_b128 v[0:3], v4, s[0:1] scale_offset
+; GCN-GISEL-NEXT: global_load_b128 v[4:7], v4, s[2:3] scale_offset
; GCN-GISEL-NEXT: s_wait_loadcnt 0x0
; GCN-GISEL-NEXT: v_pk_add_u16 v0, v0, v4
; GCN-GISEL-NEXT: v_pk_add_u16 v1, v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
new file mode 100644
index 0000000000000..64392a15e9a9b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -0,0 +1,436 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b32_idx32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr %p, i32 %idx
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) {
+; SDAG-LABEL: flat_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
+; SDAG-NEXT: flat_load_b32 v0, v[0:1]
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: flat_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1]
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GISEL-NEXT: flat_load_b32 v0, v[0:1]
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd
+ %ld = load i16, ptr %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
+ %ret = load <2 x float>, ptr %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b96_idxpromi_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom
+ %ret = load <4 x float>, ptr %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxadd
+ %ret = load float, ptr %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, there is nothing to scale
+
+define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_u8 v0, v0, s[0:1] offset:16
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr %p, i64 %idxadd
+ %ld = load i8, ptr %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom
+ %ld = load i16, ptr %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxadd
+ %ld = load i16, ptr %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr %p, i64 %idxprom
+ %ret = load <2 x float>, ptr %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxprom
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b96_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr %p, i64 %idxadd
+ %ret = load <3 x float>, ptr %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
+; GCN-LABEL: flat_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: flat_load_b32 v0, v[0:1]
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: flat_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr %p, i64 %idxprom
+ %ret = load <4 x float>, ptr %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_store_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1.0
+; GCN-NEXT: flat_store_b32 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr %p, i64 %idxprom
+ store float 1.0, ptr %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_store_b16_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: flat_store_b16 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr %p, i64 %idxprom
+ store i16 1, ptr %arrayidx, align 2
+ ret void
+}
+
+define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_store_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
+; GCN-NEXT: flat_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds double, ptr %p, i64 %idxprom
+ store double 1.0, ptr %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: flat_atomicrmw_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i32, ptr %p, i64 %idxprom
+ atomicrmw add ptr %arrayidx, i32 1 monotonic
+ ret void
+}
+
+define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) {
+; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; SDAG-NEXT: s_mov_b32 s0, exec_lo
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: s_wait_alu 0xfffe
+; SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3
+; SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
+; SDAG-NEXT: s_cbranch_execnz .LBB21_3
+; SDAG-NEXT: ; %bb.1: ; %Flow
+; SDAG-NEXT: s_wait_alu 0xfffe
+; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
+; SDAG-NEXT: s_cbranch_execnz .LBB21_4
+; SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
+; SDAG-NEXT: s_wait_alu 0xfffe
+; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: s_branch .LBB21_5
+; SDAG-NEXT: .LBB21_3: ; %atomicrmw.global
+; SDAG-NEXT: v_mov_b64_e32 v[0:1], 1
+; SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_wait_alu 0xfffe
+; SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0
+; SDAG-NEXT: s_cbranch_execz .LBB21_2
+; SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: scratch_load_b64 v[0:1], v4, off
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; SDAG-NEXT: scratch_store_b64 v4, v[2:3], off
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_wait_alu 0xfffe
+; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; SDAG-NEXT: s_branch .LBB21_5
+; SDAG-NEXT: .LBB21_5:
+;
+; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
+; GISEL-NEXT: s_mov_b32 s2, exec_lo
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[2:3]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, v0
+; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v5, v1, vcc_lo
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_cmpx_ne_u32_e64 s3, v5
+; GISEL-NEXT: s_xor_b32 s2, exec_lo, s2
+; GISEL-NEXT: s_cbranch_execnz .LBB21_3
+; GISEL-NEXT: ; %bb.1: ; %Flow
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
+; GISEL-NEXT: s_cbranch_execnz .LBB21_4
+; GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_branch .LBB21_5
+; GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], 1
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: flat_atomic_add_u64 v[0:1], v2, v[0:1], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_and_not1_saveexec_b32 s0, s2
+; GISEL-NEXT: s_cbranch_execz .LBB21_2
+; GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5]
+; GISEL-NEXT: s_wait_alu 0xfffd
+; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: scratch_load_b64 v[0:1], v4, off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1
+; GISEL-NEXT: scratch_store_b64 v4, v[2:3], off
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: s_wait_alu 0xfffe
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GISEL-NEXT: s_branch .LBB21_5
+; GISEL-NEXT: .LBB21_5:
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i64, ptr %p, i64 %idxprom
+ %ret = atomicrmw add ptr %arrayidx, i64 1 monotonic
+ %ret.cast = bitcast i64 %ret to <2 x float>
+ ret <2 x float> %ret.cast
+}
+
+!0 = !{i32 0, i32 1024}
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
new file mode 100644
index 0000000000000..faea84e34d7eb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefixes=GCN,GISEL %s
+
+define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b32_idx32:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i32 %idx
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; SDAG-LABEL: global_load_b32_idxprom_wrong_stride:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
+; SDAG-NEXT: global_load_b32 v0, v[0:1], off
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: ; return to shader part epilog
+;
+; GISEL-LABEL: global_load_b32_idxprom_wrong_stride:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 3, v[0:1]
+; GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo
+; GISEL-NEXT: global_load_b32 v0, v[0:1], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b16_idxprom_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(1) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b96_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b96_idxpromi_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_load_b128_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b32_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b32_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b32 v0, v0, s[0:1] offset:64 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxadd
+ %ret = load float, ptr addrspace(1) %arrayidx, align 4
+ ret float %ret
+}
+
+; Note: this is a byte load, there is nothing to scale
+
+define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b8_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_u8 v0, v0, s[0:1] offset:16
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %idxadd
+ %ld = load i8, ptr addrspace(1) %arrayidx
+ %ret.i32 = zext i8 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b16_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_u16 v0, v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
+ %ld = load i16, ptr addrspace(1) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b16_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxadd
+ %ld = load i16, ptr addrspace(1) %arrayidx, align 2
+ %ret.i32 = zext i16 %ld to i32
+ %ret = bitcast i32 %ret.i32 to float
+ ret float %ret
+}
+
+define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b64_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b64 v[0:1], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <2 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <2 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <2 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b96_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b96_idxprom_range_ioffset:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %idxadd = add i64 %idxprom, 16
+ %arrayidx = getelementptr inbounds [3 x float], ptr addrspace(1) %p, i64 %idxadd
+ %ret = load <3 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
+; GCN-LABEL: global_load_b128_idxprom_range:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: global_load_b32 v0, v[0:1], off
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: global_load_b128 v[0:3], v0, s[0:1] scale_offset
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idx = load i32, ptr addrspace(1) %pp, align 4, !range !0
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds <4 x float>, ptr addrspace(1) %p, i64 %idxprom
+ %ret = load <4 x float>, ptr addrspace(1) %arrayidx, align 4
+ ret <4 x float> %ret
+}
+
+define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_store_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1.0
+; GCN-NEXT: global_store_b32 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %p, i64 %idxprom
+ store float 1.0, ptr addrspace(1) %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) {
+; GCN-LABEL: global_store_b16_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: global_store_b16 v0, v1, s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %p, i64 %idxprom
+ store i16 1, ptr addrspace(1) %arrayidx, align 2
+ ret void
+}
+
+define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_store_b64_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1.0
+; GCN-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds double, ptr addrspace(1) %p, i64 %idxprom
+ store double 1.0, ptr addrspace(1) %arrayidx, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
+; GCN-LABEL: global_atomicrmw_b32_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v1, 1
+; GCN-NEXT: global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
+; GCN-NEXT: s_endpgm
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %idxprom
+ atomicrmw add ptr addrspace(1) %arrayidx, i32 1 monotonic
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) {
+; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b64_e32 v[2:3], 1
+; GCN-NEXT: global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %idxprom = sext i32 %idx to i64
+ %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %p, i64 %idxprom
+ %ret = atomicrmw add ptr addrspace(1) %arrayidx, i64 1 monotonic
+ %ret.cast = bitcast i64 %ret to <2 x float>
+ ret <2 x float> %ret.cast
+}
+
+!0 = !{i32 0, i32 1024}