[llvm] 7a18bdb - [AMDGPU] Implement flat scratch init for pal
Sebastian Neubauer via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 20 02:14:45 PST 2020
Author: Sebastian Neubauer
Date: 2020-11-20T11:14:30+01:00
New Revision: 7a18bdb350e4a0318776cf66cf280b386eb9c3e8
URL: https://github.com/llvm/llvm-project/commit/7a18bdb350e4a0318776cf66cf280b386eb9c3e8
DIFF: https://github.com/llvm/llvm-project/commit/7a18bdb350e4a0318776cf66cf280b386eb9c3e8.diff
LOG: [AMDGPU] Implement flat scratch init for pal
Extract the scratch offset from the scratch buffer descriptor that is
stored in the global table.
Differential Revision: https://reviews.llvm.org/D91701
Added:
Modified:
llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
llvm/lib/Target/AMDGPU/SIRegisterInfo.h
llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
llvm/test/CodeGen/AMDGPU/flat-scratch.ll
llvm/test/CodeGen/AMDGPU/scratch-simple.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 367bca1ce2b7..18ad8e1ce35f 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -296,6 +296,31 @@ static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
.addMemOperand(MMO);
}
+static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const SIInstrInfo *TII,
+ Register TargetReg) {
+ MachineFunction *MF = MBB.getParent();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
+ Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
+ Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
+
+ if (MFI->getGITPtrHigh() != 0xffffffff) {
+ BuildMI(MBB, I, DL, SMovB32, TargetHi)
+ .addImm(MFI->getGITPtrHigh())
+ .addReg(TargetReg, RegState::ImplicitDefine);
+ } else {
+ const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+ BuildMI(MBB, I, DL, GetPC64, TargetReg);
+ }
+ Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
+ MF->getRegInfo().addLiveIn(GitPtrLo);
+ MBB.addLiveIn(GitPtrLo);
+ BuildMI(MBB, I, DL, SMovB32, TargetLo)
+ .addReg(GitPtrLo);
+}
+
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -315,16 +340,74 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.
- Register FlatScratchInitReg =
- MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
- assert(FlatScratchInitReg);
+ Register FlatScrInitLo;
+ Register FlatScrInitHi;
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MRI.addLiveIn(FlatScratchInitReg);
- MBB.addLiveIn(FlatScratchInitReg);
+ if (ST.isAmdPalOS()) {
+ // Extract the scratch offset from the descriptor in the GIT
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*TRI);
+ LiveRegs.addLiveIns(MBB);
+
+ // Find unused reg to load flat scratch init into
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register FlatScrInit = AMDGPU::NoRegister;
+ ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
+ unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
+ AllSGPR64s = AllSGPR64s.slice(
+ std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
+ Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+ for (MCPhysReg Reg : AllSGPR64s) {
+ if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
+ FlatScrInit = Reg;
+ break;
+ }
+ }
+ assert(FlatScrInit && "Failed to find free register for scratch init");
- Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
+
+ buildGitPtr(MBB, I, DL, TII, FlatScrInit);
+
+ // With the GIT ptr in place, load the first two dwords of the scratch
+ // descriptor from the entry at offset 0 (or 16 for a compute shader).
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+ const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
+ auto *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 8, Align(4));
+ unsigned Offset =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
+ BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
+ .addReg(FlatScrInit)
+ .addImm(EncodedOffset) // offset
+ .addImm(0) // glc
+ .addImm(0) // dlc
+ .addMemOperand(MMO);
+
+ // Keep only the offset in bits [47:0] of the descriptor (mask the high dword)
+ const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
+ BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
+ .addReg(FlatScrInitHi)
+ .addImm(0xffff);
+ } else {
+ Register FlatScratchInitReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+ assert(FlatScratchInitReg);
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.addLiveIn(FlatScratchInitReg);
+ MBB.addLiveIn(FlatScratchInitReg);
+
+ FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ }
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
@@ -582,26 +665,9 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
- Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
- const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-
- if (MFI->getGITPtrHigh() != 0xffffffff) {
- BuildMI(MBB, I, DL, SMovB32, RsrcHi)
- .addImm(MFI->getGITPtrHigh())
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
- } else {
- const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
- BuildMI(MBB, I, DL, GetPC64, Rsrc01);
- }
- Register GitPtrLo = MFI->getGITPtrLoReg(MF);
- MF.getRegInfo().addLiveIn(GitPtrLo);
- MBB.addLiveIn(GitPtrLo);
- BuildMI(MBB, I, DL, SMovB32, RsrcLo)
- .addReg(GitPtrLo)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ buildGitPtr(MBB, I, DL, TII, Rsrc01);
// We now have the GIT ptr - now get the scratch descriptor from the entry
// at offset 0 (or offset 16 for a compute shader).
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9f81d534e5fa..26b2150703e3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2030,7 +2030,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info.hasFlatScratchInit()) {
+ if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 159a678f8388..71db374081e0 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2151,6 +2151,12 @@ SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
ST.getMaxNumSGPRs(MF) / 4);
}
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
+ ST.getMaxNumSGPRs(MF) / 2);
+}
+
ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 8a92d676d1d2..5afde2cdcde5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -324,6 +324,10 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
+ /// Return all SGPR64 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllSGPR64(const MachineFunction &MF) const;
+
/// Return all SGPR32 which satisfy the waves per execution unit requirement
/// of the subtarget.
ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
diff --git a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
index 054e022949d9..3737d3852dfa 100644
--- a/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
+++ b/llvm/test/CodeGen/AMDGPU/SRSRC-GIT-clobber-check.mir
@@ -23,7 +23,7 @@ machineFunctionInfo:
body: |
; CHECK: $sgpr1 = COPY killed $sgpr5
; CHECK: $sgpr4_sgpr5 = S_GETPC_B64
- ; CHECK: $sgpr4 = S_MOV_B32 $sgpr8, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK: $sgpr4 = S_MOV_B32 $sgpr8
; CHECK: $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM $sgpr4_sgpr5, 0, 0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 :: (dereferenceable invariant load 16, align 4, addrspace 4)
bb.0:
successors: %bb.1, %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index fbe664f85cff..09b9f3430d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10-PAL %s
define amdgpu_kernel void @zero_init_kernel() {
; GFX9-LABEL: zero_init_kernel:
@@ -45,6 +47,59 @@ define amdgpu_kernel void @zero_init_kernel() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: zero_init_kernel:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, s0
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_mov_b32 s3, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:64
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: zero_init_kernel:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: s_mov_b32 s1, s0
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_mov_b32 s3, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:64
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:48
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:32
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:16
+; GFX10-PAL-NEXT: s_endpgm
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
@@ -89,6 +144,44 @@ define void @zero_init_foo() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: zero_init_foo:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, s0
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_mov_b32 s3, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: zero_init_foo:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: s_mov_b32 s1, s0
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_mov_b32 s3, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca [32 x i16], align 2, addrspace(5)
%cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
@@ -129,6 +222,49 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: scratch_load_dword v0, off, s1
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_sindex_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX9-PAL-NEXT: s_mov_b32 s4, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
+; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
+; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_sindex_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX10-PAL-NEXT: s_mov_b32 s4, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
+; GFX10-PAL-NEXT: s_endpgm
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -172,6 +308,46 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: scratch_load_dword v0, off, s0
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_sindex_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
+; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT: s_add_u32 s1, 4, s1
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
+; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_sindex_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX10-PAL-NEXT: s_add_u32 s1, 4, s1
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
+; GFX10-PAL-NEXT: s_endpgm
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -213,6 +389,44 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_vindex_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_vindex_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
+; GFX10-PAL-NEXT: s_endpgm
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -257,6 +471,35 @@ define void @store_load_vindex_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_vindex_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
+; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
+; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_vindex_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s32
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
+; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [32 x float], align 4, addrspace(5)
%i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -288,6 +531,24 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: private_ptr_foo:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:4
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: private_ptr_foo:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:4
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
store float 1.000000e+01, float addrspace(5)* %gep, align 4
ret void
@@ -341,6 +602,64 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_mov_b32 s1, s0
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_mov_b32 s3, s0
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:272
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:288
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:304
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:320
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: zero_init_small_offset_kernel:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
+; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: s_mov_b32 s1, s0
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_mov_b32 s3, s0
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:272
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:288
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:304
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:320
+; GFX10-PAL-NEXT: s_endpgm
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -392,6 +711,48 @@ define void @zero_init_small_offset_foo() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: zero_init_small_offset_foo:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, s0
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_mov_b32 s3, s0
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: zero_init_small_offset_foo:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32
+; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: s_mov_b32 s1, s0
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_mov_b32 s3, s0
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [64 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -440,6 +801,54 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: scratch_load_dword v0, off, s1
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX9-PAL-NEXT: s_mov_b32 s4, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
+; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_sindex_small_offset_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX10-PAL-NEXT: s_mov_b32 s4, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
+; GFX10-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -491,6 +900,51 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: scratch_load_dword v0, off, s0
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
+; GFX9-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_sindex_small_offset_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
+; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-PAL-NEXT: s_add_u32 s0, 0x104, s0
+; GFX10-PAL-NEXT: s_add_u32 s1, 0x104, s1
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
+; GFX10-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -539,6 +993,48 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
+; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_vindex_small_offset_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x104
+; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4
+; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
+; GFX10-PAL-NEXT: s_endpgm
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -591,6 +1087,40 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32
+; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x100
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
+; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
+; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x100
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo
+; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
+; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32
+; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [64 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -659,6 +1189,68 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_mov_b32 s1, s0
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_mov_b32 s3, s0
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
+; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
+; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
+; GFX9-PAL-NEXT: s_movk_i32 vcc_hi, 0x4010
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: zero_init_large_offset_kernel:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
+; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX10-PAL-NEXT: s_mov_b32 s1, s0
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_mov_b32 s3, s0
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX10-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-PAL-NEXT: s_endpgm
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -718,6 +1310,56 @@ define void @zero_init_large_offset_foo() {
; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: zero_init_large_offset_foo:
+; GFX9-PAL: ; %bb.0:
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32
+; GFX9-PAL-NEXT: s_mov_b32 s0, 0
+; GFX9-PAL-NEXT: s_mov_b32 s1, s0
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_mov_b32 s3, s0
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi
+; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:16
+; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:32
+; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: zero_init_large_offset_foo:
+; GFX10-PAL: ; %bb.0:
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32
+; GFX10-PAL-NEXT: s_mov_b32 s0, 0
+; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX10-PAL-NEXT: s_mov_b32 s1, s0
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_mov_b32 s3, s0
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo
+; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:16
+; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
+; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
%padding = alloca [4096 x i32], align 4, addrspace(5)
%alloca = alloca [32 x i16], align 2, addrspace(5)
%pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -766,6 +1408,54 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
; GFX10-NEXT: scratch_store_dword off, v0, s0
; GFX10-NEXT: scratch_load_dword v0, off, s1
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX9-PAL-NEXT: s_mov_b32 s4, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
+; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_sindex_large_offset_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX10-PAL-NEXT: s_mov_b32 s4, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
+; GFX10-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -817,6 +1507,51 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
; GFX10-NEXT: scratch_store_dword off, v0, s1
; GFX10-NEXT: scratch_load_dword v0, off, s0
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
+; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
+; GFX9-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_sindex_large_offset_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, off offset:4
+; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-PAL-NEXT: s_add_u32 s0, 0x4004, s0
+; GFX10-PAL-NEXT: s_add_u32 s1, 0x4004, s1
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1
+; GFX10-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -865,6 +1600,48 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
; GFX10-NEXT: scratch_store_dword v2, v3, off
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: scratch_load_dword v1, off, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX9-PAL-NEXT: v_add_u32_e32 v2, v1, v0
+; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: v_sub_u32_e32 v0, v1, v0
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_vindex_large_offset_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x4004
+; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX10-PAL-NEXT: v_add_nc_u32_e32 v2, v1, v0
+; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v0, v1, v0
+; GFX10-PAL-NEXT: scratch_load_dword v1, off, off offset:4
+; GFX10-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:124
+; GFX10-PAL-NEXT: s_endpgm
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -917,6 +1694,40 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32
+; GFX9-PAL-NEXT: s_add_u32 vcc_hi, s32, 0x4000
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, vcc_hi
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
+; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
+; GFX9-PAL-NEXT: v_and_b32_e32 v0, v0, v3
+; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: s_add_u32 vcc_lo, s32, 0x4000
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, vcc_lo
+; GFX10-PAL-NEXT: v_and_b32_e32 v3, v0, v1
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v2
+; GFX10-PAL-NEXT: v_lshl_add_u32 v2, v3, 2, v2
+; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32
+; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
+; GFX10-PAL-NEXT: scratch_load_dword v0, v2, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%padding = alloca [4096 x i32], align 4, addrspace(5)
%i = alloca [32 x float], align 4, addrspace(5)
@@ -962,6 +1773,45 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX9-PAL-NEXT: s_mov_b32 s2, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, vcc_hi offset:4
+; GFX9-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_large_imm_offset_kernel:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
+; GFX10-PAL-NEXT: s_mov_b32 s2, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
+; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-PAL-NEXT: s_add_u32 s0, 4, s0
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, off offset:4
+; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664
+; GFX10-PAL-NEXT: s_endpgm
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
%i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
@@ -1002,6 +1852,35 @@ define void @store_load_large_imm_offset_foo() {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32
+; GFX9-PAL-NEXT: s_add_u32 s0, s32, s0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
+; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
+; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: s_add_u32 s0, s32, s0
+; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32
+; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
+; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
%i = alloca [4096 x i32], align 4, addrspace(5)
%i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
@@ -1042,6 +1921,44 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024
; GFX10-NEXT: s_endpgm
+;
+; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX9-PAL-NEXT: s_mov_b32 s4, s0
+; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
+; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
+; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
+; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024
+; GFX9-PAL-NEXT: s_endpgm
+;
+; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
+; GFX10-PAL-NEXT: s_mov_b32 s4, s0
+; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
+; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
+; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
+; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
+; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, 4
+; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off offset:1024
+; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off offset:1024
+; GFX10-PAL-NEXT: s_endpgm
bb:
%alloca = alloca [32 x i32], align 4, addrspace(5)
%vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -1076,6 +1993,29 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_i64_aligned:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_i64_aligned:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, i64 addrspace(5)* %arg, align 8
%load = load volatile i64, i64 addrspace(5)* %arg, align 8
@@ -1105,6 +2045,29 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_i64_unaligned:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_i64_unaligned:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
+; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile i64 15, i64 addrspace(5)* %arg, align 1
%load = load volatile i64, i64 addrspace(5)* %arg, align 1
@@ -1136,6 +2099,31 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3
+; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off
+; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off
+; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
%load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
@@ -1169,6 +2157,33 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
+; GFX9-PAL: ; %bb.0: ; %bb
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2
+; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3
+; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4
+; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off
+; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off
+; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
+; GFX10-PAL: ; %bb.0: ; %bb
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
+; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2
+; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3
+; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4
+; GFX10-PAL-NEXT: ; implicit-def: $vcc_hi
+; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off
+; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off
+; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
bb:
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
%load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
index ff8031bf9bda..af43d8a2c460 100644
--- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -6,6 +6,8 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR,GFX9_10-FLATSCR %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR-PAL %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR-PAL,GFX9_10-FLATSCR %s
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0
; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0
@@ -25,6 +27,28 @@
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
+; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-FLATSCR-PAL-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
+
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1
; MUBUF-DAG: s_mov_b32 s2, -1
@@ -44,6 +68,7 @@
; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
+; GFX10-FLATSCR-PAL: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
; GCN-NOT: s_mov_b32 s0
; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
@@ -68,6 +93,27 @@ define amdgpu_ps float @ps_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
+; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
+
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; GCN-NOT: s_mov_b32 s0
@@ -98,6 +144,27 @@ define amdgpu_vs float @vs_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[2:3]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s2, s0
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
+; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s3, s3, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s2, s0
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s3, 0
+
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[2:3]
+; GFX10-FLATSCR-PAL: s_mov_b32 s2, s0
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[2:3], s[2:3], 0x10
+; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL: s_and_b32 s3, s3, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s2, s2, s0
+; GFX10-FLATSCR-PAL: s_addc_u32 s3, s3, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+
; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
@@ -152,6 +219,27 @@ define amdgpu_hs float @hs_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
+; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
+
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+
; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
@@ -184,6 +272,27 @@ define amdgpu_gs float @gs_main(i32 %idx) {
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
+; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
+
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+
; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
@@ -217,6 +326,27 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg,
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+; GFX9-FLATSCR-PAL-DAG: s_getpc_b64 s[0:1]
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 s0, s8
+; GFX9-FLATSCR-PAL-DAG: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-FLATSCR-PAL-DAG: v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-FLATSCR-PAL-DAG: v_mov_b32_e32 v0, 0xbf20e7f4
+; GFX9-FLATSCR-PAL-DAG: s_mov_b32 vcc_hi, 0
+; GFX9-FLATSCR-PAL-DAG: s_waitcnt lgkmcnt(0)
+; GFX9-FLATSCR-PAL-DAG: s_and_b32 s1, s1, 0xffff
+; GFX9-FLATSCR-PAL-DAG: s_add_u32 flat_scratch_lo, s0, s5
+; GFX9-FLATSCR-PAL-DAG: s_addc_u32 flat_scratch_hi, s1, 0
+
+; GFX10-FLATSCR-PAL: s_getpc_b64 s[0:1]
+; GFX10-FLATSCR-PAL: s_mov_b32 s0, s8
+; GFX10-FLATSCR-PAL: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX10-FLATSCR-PAL: s_waitcnt lgkmcnt(0)
+; GFX10-FLATSCR-PAL: s_and_b32 s1, s1, 0xffff
+; GFX10-FLATSCR-PAL: s_add_u32 s0, s0, s5
+; GFX10-FLATSCR-PAL: s_addc_u32 s1, s1, 0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
+; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
+
; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
More information about the llvm-commits mailing list