[llvm] 13c0316 - [AMDGPU] Restrict immediate scratch offsets

Sebastian Neubauer via llvm-commits llvm-commits at lists.llvm.org
Fri May 7 06:01:20 PDT 2021


Author: Sebastian Neubauer
Date: 2021-05-07T14:51:32+02:00
New Revision: 13c0316239dc31a34262f2270d0952aa152a9a76

URL: https://github.com/llvm/llvm-project/commit/13c0316239dc31a34262f2270d0952aa152a9a76
DIFF: https://github.com/llvm/llvm-project/commit/13c0316239dc31a34262f2270d0952aa152a9a76.diff

LOG: [AMDGPU] Restrict immediate scratch offsets

gfx9 does not work with negative offsets, gfx10 works only with
aligned negative offsets, but not with unaligned negative offsets.

This is slightly more conservative than needed, gfx9 does support
negative offsets when a VGPR address is used and gfx10 supports
negative, unaligned offsets when an SGPR address is used, but we
do not make use of that with this patch.

Differential Revision: https://reviews.llvm.org/D101292

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.td
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
    llvm/lib/Target/AMDGPU/GCNSubtarget.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index d08efec787e9..3593412444b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -232,6 +232,18 @@ def FeatureFlatSegmentOffsetBug : SubtargetFeature<"flat-segment-offset-bug",
   "GFX10 bug where inst_offset is ignored when flat instructions access global memory"
 >;
 
+def FeatureNegativeScratchOffsetBug : SubtargetFeature<"negative-scratch-offset-bug",
+  "NegativeScratchOffsetBug",
+  "true",
+  "Negative immediate offsets in scratch instructions with an SGPR offset page fault on GFX9"
+>;
+
+def FeatureNegativeUnalignedScratchOffsetBug : SubtargetFeature<"negative-unaligned-scratch-offset-bug",
+  "NegativeUnalignedScratchOffsetBug",
+  "true",
+  "Scratch instructions with a VGPR offset and a negative immediate offset that is not a multiple of 4 read wrong memory on GFX10"
+>;
+
 def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
   "HasOffset3fBug",
   "true",
@@ -771,7 +783,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
    FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
    FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
    FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
+   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+   FeatureNegativeScratchOffsetBug
   ]
 >;
 
@@ -988,7 +1001,8 @@ def FeatureGroup {
     FeatureLdsBranchVmemWARHazard,
     FeatureNSAtoVMEMBug,
     FeatureOffset3fBug,
-    FeatureFlatSegmentOffsetBug
+    FeatureFlatSegmentOffsetBug,
+    FeatureNegativeUnalignedScratchOffsetBug
    ];
 }
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 6d307dbc7252..24e35d3f3e88 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1669,7 +1669,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
     SDValue N0, N1;
     if (isBaseWithConstantOffset64(Addr, N0, N1)) {
-      uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+      int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
@@ -1911,17 +1911,11 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
 
   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch)) {
-    const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
-    // Use signed division by a power of two to truncate towards 0.
-    int64_t D = 1LL << (NumBits - 1);
-    int64_t RemainderOffset = (COffsetVal / D) * D;
-    int64_t ImmField = COffsetVal - RemainderOffset;
+    int64_t SplitImmOffset, RemainderOffset;
+    std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
+        COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
 
-    assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS,
-                                  SIInstrFlags::FlatScratch));
-    assert(RemainderOffset + ImmField == COffsetVal);
-
-    COffsetVal = ImmField;
+    COffsetVal = SplitImmOffset;
 
     SDLoc DL(N);
     SDValue AddOffset =

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 4b2ec0118ec4..d9b99e4c2236 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -195,7 +195,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
   { }
 
 GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
-                           const GCNTargetMachine &TM) :
+                           const GCNTargetMachine &TM)
+    : // clang-format off
     AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
     AMDGPUSubtarget(TT),
     TargetTriple(TT),
@@ -238,6 +239,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     GFX10_3Insts(false),
     GFX7GFX8GFX9Insts(false),
     SGPRInitBug(false),
+    NegativeScratchOffsetBug(false),
+    NegativeUnalignedScratchOffsetBug(false),
     HasSMemRealTime(false),
     HasIntClamp(false),
     HasFmaMixInsts(false),
@@ -312,6 +315,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
     TLInfo(TM, *this),
     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
+  // clang-format on
   MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
   InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));

diff  --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 00ea653cb710..1203d6f071c4 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -113,6 +113,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool GFX10_3Insts;
   bool GFX7GFX8GFX9Insts;
   bool SGPRInitBug;
+  bool NegativeScratchOffsetBug;
+  bool NegativeUnalignedScratchOffsetBug;
   bool HasSMemRealTime;
   bool HasIntClamp;
   bool HasFmaMixInsts;
@@ -890,6 +892,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return SGPRInitBug;
   }
 
+  bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
+
+  bool hasNegativeUnalignedScratchOffsetBug() const {
+    return NegativeUnalignedScratchOffsetBug;
+  }
+
   bool hasMFMAInlineLiteralBug() const {
     return HasMFMAInlineLiteralBug;
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fb0dcb67f759..fbc12cba37c1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7368,6 +7368,36 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
 }
 
+// Depending on the used address space and instructions, some immediate offsets
+// are allowed and some are not.
+// In general, flat instruction offsets can only be non-negative, global and
+// scratch instruction offsets can also be negative.
+//
+// There are several bugs related to these offsets:
+// On gfx10.1, flat instructions that go into the global address space cannot
+// use an offset.
+//
+// For scratch instructions, the address can be either an SGPR or a VGPR.
+// The following offsets can be used, depending on the architecture (x means
+// cannot be used):
+// +----------------------------+------+------+
+// | Address-Mode               | SGPR | VGPR |
+// +----------------------------+------+------+
+// | gfx9                       |      |      |
+// | negative, 4-aligned offset | x    | ok   |
+// | negative, unaligned offset | x    | ok   |
+// +----------------------------+------+------+
+// | gfx10                      |      |      |
+// | negative, 4-aligned offset | ok   | ok   |
+// | negative, unaligned offset | ok   | x    |
+// +----------------------------+------+------+
+// | gfx10.3                    |      |      |
+// | negative, 4-aligned offset | ok   | ok   |
+// | negative, unaligned offset | ok   | ok   |
+// +----------------------------+------+------+
+//
+// This function ignores the addressing mode, so if an offset cannot be used in
+// one addressing mode, it is considered illegal.
 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                     uint64_t FlatVariant) const {
   // TODO: Should 0 be special cased?
@@ -7380,22 +7410,44 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
     return false;
 
   bool Signed = FlatVariant != SIInstrFlags::FLAT;
+  if (ST.hasNegativeScratchOffsetBug() &&
+      FlatVariant == SIInstrFlags::FlatScratch)
+    Signed = false;
+  if (ST.hasNegativeUnalignedScratchOffsetBug() &&
+      FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
+      (Offset % 4) != 0) {
+    return false;
+  }
+
   unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
   return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
 }
 
+// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
 std::pair<int64_t, int64_t>
 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                              uint64_t FlatVariant) const {
   int64_t RemainderOffset = COffsetVal;
   int64_t ImmField = 0;
   bool Signed = FlatVariant != SIInstrFlags::FLAT;
+  if (ST.hasNegativeScratchOffsetBug() &&
+      FlatVariant == SIInstrFlags::FlatScratch)
+    Signed = false;
+
   const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
   if (Signed) {
     // Use signed division by a power of two to truncate towards 0.
     int64_t D = 1LL << (NumBits - 1);
     RemainderOffset = (COffsetVal / D) * D;
     ImmField = COffsetVal - RemainderOffset;
+
+    if (ST.hasNegativeUnalignedScratchOffsetBug() &&
+        FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
+        (ImmField % 4) != 0) {
+      // Make ImmField a multiple of 4
+      RemainderOffset += ImmField % 4;
+      ImmField -= ImmField % 4;
+    }
   } else if (COffsetVal >= 0) {
     ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
     RemainderOffset = COffsetVal - ImmField;

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 4e7c299639c7..f3b61768878d 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -2,7 +2,8 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
 
 define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-LABEL: zero_init_kernel:
@@ -74,30 +75,62 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: zero_init_kernel:
-; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: zero_init_kernel:
+; GFX1010-PAL:       ; %bb.0:
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:64
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: zero_init_kernel:
+; GFX1030-PAL:       ; %bb.0:
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:64
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
+; GFX1030-PAL-NEXT:    s_endpgm
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
@@ -650,32 +683,67 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
-; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX1010-PAL:       ; %bb.0:
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:272
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:288
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:304
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:320
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX1030-PAL:       ; %bb.0:
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:320
+; GFX1030-PAL-NEXT:    s_endpgm
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -844,32 +912,60 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
-; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
-; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
-; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
+; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT:    s_add_u32 s0, 0x104, s0
+; GFX1010-PAL-NEXT:    s_add_u32 s1, 0x104, s1
+; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
+; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
+; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT:    s_add_u32 s0, 0x104, s0
+; GFX1030-PAL-NEXT:    s_add_u32 s1, 0x104, s1
+; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -950,30 +1046,56 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT:    s_add_u32 s0, 0x104, s0
-; GFX10-PAL-NEXT:    s_add_u32 s1, 0x104, s1
-; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT:    s_add_u32 s0, 0x104, s0
+; GFX1010-PAL-NEXT:    s_add_u32 s1, 0x104, s1
+; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT:    s_add_u32 s0, 0x104, s0
+; GFX1030-PAL-NEXT:    s_add_u32 s1, 0x104, s1
+; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1051,29 +1173,54 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
-; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1260,36 +1407,71 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: zero_init_large_offset_kernel:
-; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
-; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
-; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX10-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX1010-PAL:       ; %bb.0:
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:16 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX1030-PAL:       ; %bb.0:
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:16 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1030-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4010
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1030-PAL-NEXT:    s_endpgm
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -1373,30 +1555,58 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-PAL-LABEL: zero_init_large_offset_foo:
-; GFX10-PAL:       ; %bb.0:
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
-; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
-; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX10-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
+; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
+; GFX1010-PAL:       ; %bb.0:
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX1010-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
+; GFX1030-PAL:       ; %bb.0:
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX1030-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX1030-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX1030-PAL-NEXT:    s_add_u32 vcc_lo, s32, 0x4000
+; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -1474,32 +1684,60 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[4:5]
-; GFX10-PAL-NEXT:    s_mov_b32 s4, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s4, s4, s3
-; GFX10-PAL-NEXT:    s_addc_u32 s5, s5, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
-; GFX10-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
-; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
-; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[4:5]
+; GFX1010-PAL-NEXT:    s_mov_b32 s4, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s4, s4, s3
+; GFX1010-PAL-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
+; GFX1010-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
+; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[4:5]
+; GFX1030-PAL-NEXT:    s_mov_b32 s4, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s4, s4, s3
+; GFX1030-PAL-NEXT:    s_addc_u32 s5, s5, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX1030-PAL-NEXT:    s_load_dword s0, s[0:1], 0x24
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
+; GFX1030-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
+; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1580,30 +1818,56 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
-; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
-; GFX10-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
-; GFX10-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
-; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1010-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
+; GFX1010-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
+; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
+; GFX1030-PAL-NEXT:    s_add_u32 s0, 0x4004, s0
+; GFX1030-PAL-NEXT:    s_add_u32 s1, 0x4004, s1
+; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1681,29 +1945,54 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
-; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 15
-; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
-; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
-; GFX10-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    scratch_store_dword v2, v3, off
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GFX1010-PAL-NEXT:    scratch_load_dword v1, off, vcc_lo offset:4 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, 15
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v2, v1, v0
+; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, v1, v0
+; GFX1030-PAL-NEXT:    scratch_load_dword v1, off, off offset:4 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    scratch_store_dword v2, v3, off
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -1866,28 +2155,52 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
-; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
-; GFX10-PAL:       ; %bb.0: ; %bb
-; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
-; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
-; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
-; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
-; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
-; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
-; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
-; GFX10-PAL-NEXT:    s_add_u32 s0, 4, s0
-; GFX10-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
-; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
-; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-PAL-NEXT:    s_endpgm
+; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
+; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    s_add_u32 s0, 4, s0
+; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, vcc_lo offset:4
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_endpgm
+;
+; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
+; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
+; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
+; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
+; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
+; GFX1030-PAL-NEXT:    s_add_u32 s0, 4, s0
+; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
   %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
@@ -2278,5 +2591,133 @@ bb:
   ret void
 }
 
+define void @store_load_i32_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
+; GFX9-LABEL: store_load_i32_negative_unaligned:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-NEXT:    scratch_store_byte v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: store_load_i32_negative_unaligned:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX9-PAL:       ; %bb.0: ; %bb
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -1
+  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
+  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
+  ret void
+}
+
+define void @store_load_i32_large_negative_unaligned(i8 addrspace(5)* nocapture %arg) {
+; GFX9-LABEL: store_load_i32_large_negative_unaligned:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-NEXT:    scratch_store_byte v0, v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: store_load_i32_large_negative_unaligned:
+; GFX10:       ; %bb.0: ; %bb
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 1
+; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX9-PAL:       ; %bb.0: ; %bb
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
+; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1010-PAL:       ; %bb.0: ; %bb
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
+; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
+; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
+; GFX1030-PAL:       ; %bb.0: ; %bb
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
+; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
+; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
+; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
+bb:
+  %ptr = getelementptr inbounds i8, i8 addrspace(5)* %arg, i32 -4225
+  store volatile i8 1, i8 addrspace(5)* %ptr, align 1
+  %load = load volatile i8, i8 addrspace(5)* %ptr, align 1
+  ret void
+}
+
 declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg)
 declare i32 @llvm.amdgcn.workitem.id.x()

diff  --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
index fae05d1403eb..51dbbed60fa5 100644
--- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll
@@ -279,30 +279,31 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(<3 x i64> addrspace(1
 ; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
-; FLATSCR-NEXT:    s_add_u32 s2, 16, 0x4000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
-; FLATSCR-NEXT:    s_movk_i32 vcc_hi, 0x2000
-; FLATSCR-NEXT:    s_mov_b32 s3, 0
-; FLATSCR-NEXT:    scratch_store_dword off, v0, vcc_hi
+; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
+; FLATSCR-NEXT:    s_mov_b32 s2, 0
+; FLATSCR-NEXT:    scratch_store_dword off, v0, vcc_hi offset:1024
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:  BB2_1: ; %loadstoreloop
 ; FLATSCR-NEXT:    ; =>This Inner Loop Header: Depth=1
-; FLATSCR-NEXT:    s_add_u32 s4, 0x4000, s3
-; FLATSCR-NEXT:    s_add_i32 s3, s3, 1
-; FLATSCR-NEXT:    s_cmpk_lt_u32 s3, 0x2120
-; FLATSCR-NEXT:    scratch_store_byte off, v0, s4
+; FLATSCR-NEXT:    s_add_u32 s3, 0x2000, s2
+; FLATSCR-NEXT:    s_add_i32 s2, s2, 1
+; FLATSCR-NEXT:    s_cmpk_lt_u32 s2, 0x2120
+; FLATSCR-NEXT:    scratch_store_byte off, v0, s3
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_cbranch_scc1 BB2_1
 ; FLATSCR-NEXT:  ; %bb.2: ; %split
-; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
-; FLATSCR-NEXT:    s_add_u32 s3, 0x4000, s3
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s3 offset:720 glc
+; FLATSCR-NEXT:    s_movk_i32 s2, 0x1000
+; FLATSCR-NEXT:    s_add_u32 s2, 0x2000, s2
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s2 offset:720 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s3 offset:704 glc
+; FLATSCR-NEXT:    scratch_load_dwordx4 v[0:3], off, s2 offset:704 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s2 glc
+; FLATSCR-NEXT:    s_movk_i32 s2, 0x2000
+; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s2 offset:16 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
-; FLATSCR-NEXT:    scratch_load_dwordx4 v[4:7], off, s2 offset:-16 glc
+; FLATSCR-NEXT:    s_movk_i32 s2, 0x2000
+; FLATSCR-NEXT:    scratch_load_dwordx4 v[4:7], off, s2 glc
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_mov_b32_e32 v12, 0
 ; FLATSCR-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v6


        


More information about the llvm-commits mailing list