[llvm] 853bb19 - Revert "(Reland) [fastalloc] Support allocating specific register class in fastalloc"
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 15 05:34:56 PDT 2022
Author: Luo, Yuanke
Date: 2022-08-15T20:33:15+08:00
New Revision: 853bb192c407f5d9e75a5fd55cc089151530cbd3
URL: https://github.com/llvm/llvm-project/commit/853bb192c407f5d9e75a5fd55cc089151530cbd3
DIFF: https://github.com/llvm/llvm-project/commit/853bb192c407f5d9e75a5fd55cc089151530cbd3.diff
LOG: Revert "(Reland) [fastalloc] Support allocating specific register class in fastalloc"
This reverts commit 30f9e6ebd30b79d13f99eaca4d829e0da07186b3.
Added:
Modified:
llvm/lib/CodeGen/RegAllocFast.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 1f9f8056fdaef..9e4e26f1392ed 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -281,7 +281,6 @@ namespace {
Register traceCopies(Register VirtReg) const;
Register traceCopyChain(Register Reg) const;
- bool shouldAllocateRegister(const Register Reg) const;
int getStackSpaceFor(Register VirtReg);
void spill(MachineBasicBlock::iterator Before, Register VirtReg,
MCPhysReg AssignedReg, bool Kill, bool LiveOut);
@@ -301,12 +300,6 @@ char RegAllocFast::ID = 0;
INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
false)
-bool RegAllocFast::shouldAllocateRegister(const Register Reg) const {
- assert(Register::isVirtualRegister(Reg));
- const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
- return ShouldAllocateClass(*TRI, RC);
-}
-
void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI)
RegUnitStates[*UI] = NewState;
@@ -846,8 +839,6 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) {
assert(MO.isUndef() && "expected undef use");
Register VirtReg = MO.getReg();
assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg");
- if (!shouldAllocateRegister(VirtReg))
- return;
LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
MCPhysReg PhysReg;
@@ -873,8 +864,6 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) {
/// (tied or earlyclobber) that may interfere with preassigned uses.
void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
Register VirtReg) {
- if (!shouldAllocateRegister(VirtReg))
- return;
LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
if (LRI != LiveVirtRegs.end()) {
MCPhysReg PrevReg = LRI->PhysReg;
@@ -908,8 +897,6 @@ void RegAllocFast::defineLiveThroughVirtReg(MachineInstr &MI, unsigned OpNum,
void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
Register VirtReg, bool LookAtPhysRegUses) {
assert(VirtReg.isVirtual() && "Not a virtual register");
- if (!shouldAllocateRegister(VirtReg))
- return;
MachineOperand &MO = MI.getOperand(OpNum);
LiveRegMap::iterator LRI;
bool New;
@@ -960,8 +947,6 @@ void RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum,
Register VirtReg) {
assert(VirtReg.isVirtual() && "Not a virtual register");
- if (!shouldAllocateRegister(VirtReg))
- return;
MachineOperand &MO = MI.getOperand(OpNum);
LiveRegMap::iterator LRI;
bool New;
@@ -986,13 +971,8 @@ void RegAllocFast::useVirtReg(MachineInstr &MI, unsigned OpNum,
Register Hint;
if (MI.isCopy() && MI.getOperand(1).getSubReg() == 0) {
Hint = MI.getOperand(0).getReg();
- if (Hint.isVirtual()) {
- assert(!shouldAllocateRegister(Hint));
- Hint = Register();
- } else {
- assert(Hint.isPhysical() &&
- "Copy destination should already be assigned");
- }
+ assert(Hint.isPhysical() &&
+ "Copy destination should already be assigned");
}
allocVirtReg(MI, *LRI, Hint, false);
if (LRI->Error) {
@@ -1100,8 +1080,6 @@ void RegAllocFast::addRegClassDefCounts(std::vector<unsigned> &RegClassDefCounts
assert(RegClassDefCounts.size() == TRI->getNumRegClasses());
if (Reg.isVirtual()) {
- if (!shouldAllocateRegister(Reg))
- return;
const TargetRegisterClass *OpRC = MRI->getRegClass(Reg);
for (unsigned RCIdx = 0, RCIdxEnd = TRI->getNumRegClasses();
RCIdx != RCIdxEnd; ++RCIdx) {
@@ -1161,8 +1139,6 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
if (MO.isReg()) {
Register Reg = MO.getReg();
if (Reg.isVirtual()) {
- if (!shouldAllocateRegister(Reg))
- continue;
if (MO.isDef()) {
HasDef = true;
HasVRegDef = true;
@@ -1226,7 +1202,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
}
if (MO.isDef()) {
- if (Reg.isVirtual() && shouldAllocateRegister(Reg))
+ if (Reg.isVirtual())
DefOperandIndexes.push_back(I);
addRegClassDefCounts(RegClassDefCounts, Reg);
@@ -1316,10 +1292,6 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
Register Reg = MO.getReg();
if (!Reg)
continue;
- if (Reg.isVirtual()) {
- assert(!shouldAllocateRegister(Reg));
- continue;
- }
assert(Reg.isPhysical());
if (MRI->isReserved(Reg))
continue;
@@ -1366,7 +1338,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
if (!MO.isReg() || !MO.isUse())
continue;
Register Reg = MO.getReg();
- if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
+ if (!Reg.isVirtual())
continue;
if (MO.isUndef()) {
@@ -1393,7 +1365,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
if (!MO.isReg() || !MO.isUse())
continue;
Register Reg = MO.getReg();
- if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
+ if (!Reg.isVirtual())
continue;
assert(MO.isUndef() && "Should only have undef virtreg uses left");
@@ -1416,10 +1388,6 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
Register Reg = MO.getReg();
if (!Reg)
continue;
- if (Reg.isVirtual()) {
- assert(!shouldAllocateRegister(Reg));
- continue;
- }
assert(Reg.isPhysical() && "should have register assigned");
// We sometimes get odd situations like:
@@ -1449,8 +1417,6 @@ void RegAllocFast::handleDebugValue(MachineInstr &MI) {
for (Register Reg : MI.getUsedDebugRegs()) {
if (!Register::isVirtualRegister(Reg))
continue;
- if (!shouldAllocateRegister(Reg))
- continue;
// Already spilled to a stackslot?
int SS = StackSlotForVirtReg[Reg];
@@ -1491,7 +1457,7 @@ void RegAllocFast::handleBundle(MachineInstr &MI) {
continue;
Register Reg = MO.getReg();
- if (!Reg.isVirtual() || !shouldAllocateRegister(Reg))
+ if (!Reg.isVirtual())
continue;
DenseMap<Register, MCPhysReg>::iterator DI;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
index a5bfd4a88afca..a6be8956dbcd7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll
@@ -8,50 +8,51 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
; CHECK-NEXT: s_or_saveexec_b32 s4, -1
-; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 exec_lo, s4
-; CHECK-NEXT: v_mov_b32_e32 v15, v1
-; CHECK-NEXT: v_mov_b32_e32 v14, v2
-; CHECK-NEXT: v_mov_b32_e32 v13, v3
-; CHECK-NEXT: v_mov_b32_e32 v12, v4
-; CHECK-NEXT: v_mov_b32_e32 v11, v5
-; CHECK-NEXT: v_mov_b32_e32 v10, v6
-; CHECK-NEXT: v_mov_b32_e32 v9, v7
+; CHECK-NEXT: v_mov_b32_e32 v14, v1
+; CHECK-NEXT: v_mov_b32_e32 v13, v2
+; CHECK-NEXT: v_mov_b32_e32 v12, v3
+; CHECK-NEXT: v_mov_b32_e32 v11, v4
+; CHECK-NEXT: v_mov_b32_e32 v10, v5
+; CHECK-NEXT: v_mov_b32_e32 v9, v6
+; CHECK-NEXT: v_mov_b32_e32 v8, v7
; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
-; CHECK-NEXT: v_mov_b32_e32 v1, v15
-; CHECK-NEXT: v_mov_b32_e32 v2, v14
-; CHECK-NEXT: v_mov_b32_e32 v3, v13
-; CHECK-NEXT: v_mov_b32_e32 v4, v12
-; CHECK-NEXT: v_mov_b32_e32 v5, v11
-; CHECK-NEXT: v_mov_b32_e32 v6, v10
-; CHECK-NEXT: v_mov_b32_e32 v7, v9
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; CHECK-NEXT: v_mov_b32_e32 v1, v14
+; CHECK-NEXT: v_mov_b32_e32 v2, v13
+; CHECK-NEXT: v_mov_b32_e32 v3, v12
+; CHECK-NEXT: v_mov_b32_e32 v4, v11
+; CHECK-NEXT: v_mov_b32_e32 v5, v10
+; CHECK-NEXT: v_mov_b32_e32 v6, v9
+; CHECK-NEXT: v_mov_b32_e32 v7, v8
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s8, 0
; CHECK-NEXT: s_mov_b32 s4, s8
; CHECK-NEXT: s_mov_b32 s5, s8
; CHECK-NEXT: s_mov_b32 s6, s8
; CHECK-NEXT: s_mov_b32 s7, s8
-; CHECK-NEXT: v_writelane_b32 v8, s4, 0
-; CHECK-NEXT: v_writelane_b32 v8, s5, 1
-; CHECK-NEXT: v_writelane_b32 v8, s6, 2
-; CHECK-NEXT: v_writelane_b32 v8, s7, 3
+; CHECK-NEXT: v_writelane_b32 v16, s4, 0
+; CHECK-NEXT: v_writelane_b32 v16, s5, 1
+; CHECK-NEXT: v_writelane_b32 v16, s6, 2
+; CHECK-NEXT: v_writelane_b32 v16, s7, 3
; CHECK-NEXT: s_mov_b32 s6, 0
; CHECK-NEXT: s_mov_b32 s4, s6
; CHECK-NEXT: s_mov_b32 s5, s6
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b32 s4, exec_lo
-; CHECK-NEXT: v_writelane_b32 v8, s4, 4
+; CHECK-NEXT: v_writelane_b32 v16, s4, 4
; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
@@ -59,16 +60,15 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v7, v9
-; CHECK-NEXT: v_mov_b32_e32 v6, v10
-; CHECK-NEXT: v_mov_b32_e32 v5, v11
-; CHECK-NEXT: v_mov_b32_e32 v4, v12
-; CHECK-NEXT: v_mov_b32_e32 v3, v13
-; CHECK-NEXT: v_mov_b32_e32 v2, v14
-; CHECK-NEXT: v_mov_b32_e32 v1, v15
-; CHECK-NEXT: v_mov_b32_e32 v0, v16
+; CHECK-NEXT: v_mov_b32_e32 v7, v8
+; CHECK-NEXT: v_mov_b32_e32 v6, v9
+; CHECK-NEXT: v_mov_b32_e32 v5, v10
+; CHECK-NEXT: v_mov_b32_e32 v4, v11
+; CHECK-NEXT: v_mov_b32_e32 v3, v12
+; CHECK-NEXT: v_mov_b32_e32 v2, v13
+; CHECK-NEXT: v_mov_b32_e32 v1, v14
+; CHECK-NEXT: v_mov_b32_e32 v0, v15
; CHECK-NEXT: v_readfirstlane_b32 s12, v7
; CHECK-NEXT: v_readfirstlane_b32 s10, v6
; CHECK-NEXT: v_readfirstlane_b32 s9, v5
@@ -85,22 +85,22 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: s_mov_b32 s17, s6
; CHECK-NEXT: s_mov_b32 s18, s5
; CHECK-NEXT: s_mov_b32 s19, s4
-; CHECK-NEXT: v_writelane_b32 v8, s12, 5
-; CHECK-NEXT: v_writelane_b32 v8, s13, 6
-; CHECK-NEXT: v_writelane_b32 v8, s14, 7
-; CHECK-NEXT: v_writelane_b32 v8, s15, 8
-; CHECK-NEXT: v_writelane_b32 v8, s16, 9
-; CHECK-NEXT: v_writelane_b32 v8, s17, 10
-; CHECK-NEXT: v_writelane_b32 v8, s18, 11
-; CHECK-NEXT: v_writelane_b32 v8, s19, 12
-; CHECK-NEXT: v_mov_b32_e32 v6, v9
-; CHECK-NEXT: v_mov_b32_e32 v7, v10
-; CHECK-NEXT: v_mov_b32_e32 v4, v11
-; CHECK-NEXT: v_mov_b32_e32 v5, v12
-; CHECK-NEXT: v_mov_b32_e32 v2, v13
-; CHECK-NEXT: v_mov_b32_e32 v3, v14
-; CHECK-NEXT: v_mov_b32_e32 v0, v15
-; CHECK-NEXT: v_mov_b32_e32 v1, v16
+; CHECK-NEXT: v_writelane_b32 v16, s12, 5
+; CHECK-NEXT: v_writelane_b32 v16, s13, 6
+; CHECK-NEXT: v_writelane_b32 v16, s14, 7
+; CHECK-NEXT: v_writelane_b32 v16, s15, 8
+; CHECK-NEXT: v_writelane_b32 v16, s16, 9
+; CHECK-NEXT: v_writelane_b32 v16, s17, 10
+; CHECK-NEXT: v_writelane_b32 v16, s18, 11
+; CHECK-NEXT: v_writelane_b32 v16, s19, 12
+; CHECK-NEXT: v_mov_b32_e32 v6, v8
+; CHECK-NEXT: v_mov_b32_e32 v7, v9
+; CHECK-NEXT: v_mov_b32_e32 v4, v10
+; CHECK-NEXT: v_mov_b32_e32 v5, v11
+; CHECK-NEXT: v_mov_b32_e32 v2, v12
+; CHECK-NEXT: v_mov_b32_e32 v3, v13
+; CHECK-NEXT: v_mov_b32_e32 v0, v14
+; CHECK-NEXT: v_mov_b32_e32 v1, v15
; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13]
; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15]
; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17]
@@ -113,40 +113,40 @@ define <4 x float> @waterfall_loop(<8 x i32> %vgpr_srd) {
; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1]
; CHECK-NEXT: s_and_b32 s4, s4, s5
; CHECK-NEXT: s_and_saveexec_b32 s4, s4
-; CHECK-NEXT: v_writelane_b32 v8, s4, 13
+; CHECK-NEXT: v_writelane_b32 v16, s4, 13
; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; CHECK-NEXT: v_readlane_b32 s4, v8, 13
-; CHECK-NEXT: v_readlane_b32 s8, v8, 5
-; CHECK-NEXT: v_readlane_b32 s9, v8, 6
-; CHECK-NEXT: v_readlane_b32 s10, v8, 7
-; CHECK-NEXT: v_readlane_b32 s11, v8, 8
-; CHECK-NEXT: v_readlane_b32 s12, v8, 9
-; CHECK-NEXT: v_readlane_b32 s13, v8, 10
-; CHECK-NEXT: v_readlane_b32 s14, v8, 11
-; CHECK-NEXT: v_readlane_b32 s15, v8, 12
-; CHECK-NEXT: v_readlane_b32 s16, v8, 0
-; CHECK-NEXT: v_readlane_b32 s17, v8, 1
-; CHECK-NEXT: v_readlane_b32 s18, v8, 2
-; CHECK-NEXT: v_readlane_b32 s19, v8, 3
+; CHECK-NEXT: v_readlane_b32 s4, v16, 13
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: v_readlane_b32 s8, v16, 5
+; CHECK-NEXT: v_readlane_b32 s9, v16, 6
+; CHECK-NEXT: v_readlane_b32 s10, v16, 7
+; CHECK-NEXT: v_readlane_b32 s11, v16, 8
+; CHECK-NEXT: v_readlane_b32 s12, v16, 9
+; CHECK-NEXT: v_readlane_b32 s13, v16, 10
+; CHECK-NEXT: v_readlane_b32 s14, v16, 11
+; CHECK-NEXT: v_readlane_b32 s15, v16, 12
+; CHECK-NEXT: v_readlane_b32 s16, v16, 0
+; CHECK-NEXT: v_readlane_b32 s17, v16, 1
+; CHECK-NEXT: v_readlane_b32 s18, v16, 2
+; CHECK-NEXT: v_readlane_b32 s19, v16, 3
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4
; CHECK-NEXT: s_cbranch_execnz .LBB0_1
; CHECK-NEXT: ; %bb.3:
-; CHECK-NEXT: v_readlane_b32 s4, v8, 4
+; CHECK-NEXT: v_readlane_b32 s4, v16, 4
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: ; %bb.4:
-; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; CHECK-NEXT: ; implicit-def: $sgpr4
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v2, s4
; CHECK-NEXT: v_mov_b32_e32 v3, s4
; CHECK-NEXT: s_or_saveexec_b32 s4, -1
-; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b32 exec_lo, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 5d0931d85f92e..5077ddf894c31 100644
--- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -32,39 +32,39 @@ define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s2, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_writelane_b32 v0, s2, 0
+; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 0
; GCN_DBG-NEXT: s_load_dword s1, s[0:1], 0xa
; GCN_DBG-NEXT: s_mov_b32 s0, 0
; GCN_DBG-NEXT: s_mov_b32 s2, -1
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB0_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
-; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
+; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_read_b32 v1, v1
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
+; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_write_b32 v1, v2
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock
; GCN_DBG-NEXT: s_endpgm
@@ -107,35 +107,35 @@ define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nou
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_branch .LBB1_2
; GCN_DBG-NEXT: .LBB1_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB1_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
-; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
+; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_read_b32 v1, v1
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
+; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_write_b32 v1, v2
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1
; GCN_DBG-NEXT: s_branch .LBB1_2
entry:
@@ -172,35 +172,35 @@ define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) no
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_branch .LBB2_2
; GCN_DBG-NEXT: .LBB2_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB2_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
-; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
+; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_read_b32 v1, v1
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
+; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_write_b32 v1, v2
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1
; GCN_DBG-NEXT: s_branch .LBB2_2
entry:
@@ -238,33 +238,33 @@ define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) no
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
; GCN_DBG-NEXT: s_mov_b32 s0, 0
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_branch .LBB3_2
; GCN_DBG-NEXT: .LBB3_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB3_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1
-; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0
+; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
+; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_read_b32 v1, v1
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s2, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s2
+; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s2
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_write_b32 v1, v2
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1
; GCN_DBG-NEXT: s_branch .LBB3_2
entry:
@@ -316,48 +316,48 @@ define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n) nounwind
; GCN_DBG: ; %bb.0: ; %entry
; GCN_DBG-NEXT: s_load_dword s0, s[0:1], 0x9
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, 0
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, 0
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: ds_read_u8 v1, v1
+; GCN_DBG-NEXT: ds_read_u8 v0, v0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v1
+; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0
; GCN_DBG-NEXT: s_and_b32 s0, 1, s0
; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1
; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0
; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1
; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1
-; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
+; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2
; GCN_DBG-NEXT: s_mov_b32 s0, 0
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3
; GCN_DBG-NEXT: s_branch .LBB4_2
; GCN_DBG-NEXT: .LBB4_1: ; %for.exit
; GCN_DBG-NEXT: s_endpgm
; GCN_DBG-NEXT: .LBB4_2: ; %for.body
; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3
-; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1
-; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2
-; GCN_DBG-NEXT: v_readlane_b32 s4, v0, 0
+; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3
+; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1
+; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2
+; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0
; GCN_DBG-NEXT: s_mov_b32 s1, 2
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
; GCN_DBG-NEXT: s_mov_b32 s4, 0x80
; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_read_b32 v1, v1
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_read_b32 v0, v0
; GCN_DBG-NEXT: s_mov_b32 s4, 1.0
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
-; GCN_DBG-NEXT: v_add_f32_e64 v2, v1, s4
+; GCN_DBG-NEXT: v_add_f32_e64 v1, v0, s4
; GCN_DBG-NEXT: s_mov_b32 m0, -1
-; GCN_DBG-NEXT: v_mov_b32_e32 v1, s1
-; GCN_DBG-NEXT: ds_write_b32 v1, v2
+; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
+; GCN_DBG-NEXT: ds_write_b32 v0, v1
; GCN_DBG-NEXT: s_mov_b32 s1, 1
; GCN_DBG-NEXT: s_add_i32 s0, s0, s1
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
-; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3
+; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1
; GCN_DBG-NEXT: s_branch .LBB4_2
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index f5bf963cd2bc3..f81c46ee2439b 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -420,11 +420,11 @@ bb.end: ; preds = %bb.then, %bb
; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0:[0-9]+]]
; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1:[0-9]+]]
; GCN-O0: [[INNER_LOOP:.LBB[0-9]+_[0-9]+]]:
-; GCN-O0: buffer_load_dword
; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_0]]
; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_BACK_EDGE_EXEC_SPILL_LANE_1]]
; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_0]]
; GCN-O0-DAG: v_readlane_b32 s{{[0-9]+}}, [[VGPR]], [[INNER_LOOP_IN_EXEC_SPILL_LANE_1]]
+; GCN-O0: buffer_load_dword
; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_0:[0-9]+]]
; GCN-O0-DAG: v_writelane_b32 [[VGPR]], s{{[0-9]+}}, [[OUTER_LOOP_EXEC_SPILL_LANE_1:[0-9]+]]
; GCN-O0: s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]
diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 3548e301aee0f..7891cded195d5 100644
--- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -48,9 +48,6 @@
; VMEM: [[ENDIF]]:
-; Restore val
-; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
-
; Reload and restore exec mask
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -62,6 +59,9 @@
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
+; Restore val
+; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
+
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
entry:
@@ -121,7 +121,6 @@ endif:
; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
; GCN: [[END]]:
-; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -131,6 +130,7 @@ endif:
; VMEM: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC]], 1
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
+; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
@@ -187,7 +187,6 @@ end:
; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]]
; GCN: [[FLOW]]: ; %Flow
-; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
@@ -199,6 +198,7 @@ end:
; GCN: s_or_saveexec_b64 s[[[FLOW_S_RELOAD_SAVEEXEC_LO_SAVEEXEC:[0-9]+]]:[[FLOW_S_RELOAD_SAVEEXEC_HI_SAVEEXEC:[0-9]+]]], s[[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]]
; Regular spill value restored after exec modification
+; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload
; Followed by spill
; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
@@ -230,7 +230,6 @@ end:
; GCN-NEXT: s_branch [[FLOW]]
; GCN: [[ENDIF]]:
-; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_LO_LANE]]
; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]]
@@ -242,6 +241,7 @@ end:
; GCN: s_or_b64 exec, exec, s[[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]]
+; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index bee7e80a5a7ba..1944f813f74e9 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
; GCN-NEXT: successors: %bb.1(0x80000000)
; GCN-NEXT: liveins: $vgpr0, $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
+ ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed renamable $sgpr0_sgpr1, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset.cast, align 4, addrspace 4)
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
@@ -23,7 +23,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
; GCN-NEXT: renamable $sgpr1 = COPY killed renamable $sgpr6
; GCN-NEXT: renamable $sgpr2 = COPY killed renamable $sgpr5
; GCN-NEXT: renamable $sgpr3 = COPY killed renamable $sgpr4
- ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.1, align 4, addrspace 5)
+ ; GCN-NEXT: SI_SPILL_S128_SAVE killed $sgpr0_sgpr1_sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s128) into %stack.2, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr0 = S_MOV_B32 16
; GCN-NEXT: renamable $sgpr1 = S_MOV_B32 15
; GCN-NEXT: renamable $sgpr2 = S_MOV_B32 14
@@ -40,55 +40,59 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
; GCN-NEXT: renamable $sgpr13 = S_MOV_B32 2
; GCN-NEXT: renamable $sgpr14 = S_MOV_B32 1
; GCN-NEXT: renamable $sgpr15 = S_MOV_B32 0
- ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr15
- ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr14
- ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr13
- ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr12
- ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr11
- ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr10
- ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr9
- ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr8
- ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr7
- ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr6
- ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr5
- ; GCN-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr4
- ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr3
- ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr2
- ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr1
- ; GCN-NEXT: [[COPY16:%[0-9]+]]:vgpr_32 = COPY killed renamable $sgpr0
- ; GCN-NEXT: undef %28.sub0:vreg_512 = COPY [[COPY1]]
- ; GCN-NEXT: %28.sub1:vreg_512 = COPY [[COPY2]]
- ; GCN-NEXT: %28.sub2:vreg_512 = COPY [[COPY3]]
- ; GCN-NEXT: %28.sub3:vreg_512 = COPY [[COPY4]]
- ; GCN-NEXT: %28.sub4:vreg_512 = COPY [[COPY5]]
- ; GCN-NEXT: %28.sub5:vreg_512 = COPY [[COPY6]]
- ; GCN-NEXT: %28.sub6:vreg_512 = COPY [[COPY7]]
- ; GCN-NEXT: %28.sub7:vreg_512 = COPY [[COPY8]]
- ; GCN-NEXT: %28.sub8:vreg_512 = COPY [[COPY9]]
- ; GCN-NEXT: %28.sub9:vreg_512 = COPY [[COPY10]]
- ; GCN-NEXT: %28.sub10:vreg_512 = COPY [[COPY11]]
- ; GCN-NEXT: %28.sub11:vreg_512 = COPY [[COPY12]]
- ; GCN-NEXT: %28.sub12:vreg_512 = COPY [[COPY13]]
- ; GCN-NEXT: %28.sub13:vreg_512 = COPY [[COPY14]]
- ; GCN-NEXT: %28.sub14:vreg_512 = COPY [[COPY15]]
- ; GCN-NEXT: %28.sub15:vreg_512 = COPY [[COPY16]]
+ ; GCN-NEXT: renamable $vgpr0 = COPY killed renamable $sgpr15
+ ; GCN-NEXT: renamable $vgpr30 = COPY killed renamable $sgpr14
+ ; GCN-NEXT: renamable $vgpr29 = COPY killed renamable $sgpr13
+ ; GCN-NEXT: renamable $vgpr28 = COPY killed renamable $sgpr12
+ ; GCN-NEXT: renamable $vgpr27 = COPY killed renamable $sgpr11
+ ; GCN-NEXT: renamable $vgpr26 = COPY killed renamable $sgpr10
+ ; GCN-NEXT: renamable $vgpr25 = COPY killed renamable $sgpr9
+ ; GCN-NEXT: renamable $vgpr24 = COPY killed renamable $sgpr8
+ ; GCN-NEXT: renamable $vgpr23 = COPY killed renamable $sgpr7
+ ; GCN-NEXT: renamable $vgpr22 = COPY killed renamable $sgpr6
+ ; GCN-NEXT: renamable $vgpr21 = COPY killed renamable $sgpr5
+ ; GCN-NEXT: renamable $vgpr20 = COPY killed renamable $sgpr4
+ ; GCN-NEXT: renamable $vgpr19 = COPY killed renamable $sgpr3
+ ; GCN-NEXT: renamable $vgpr18 = COPY killed renamable $sgpr2
+ ; GCN-NEXT: renamable $vgpr17 = COPY killed renamable $sgpr1
+ ; GCN-NEXT: renamable $vgpr16 = COPY killed renamable $sgpr0
+ ; GCN-NEXT: undef renamable $vgpr0 = COPY killed renamable $vgpr0, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+ ; GCN-NEXT: renamable $vgpr1 = COPY killed renamable $vgpr30
+ ; GCN-NEXT: renamable $vgpr2 = COPY killed renamable $vgpr29
+ ; GCN-NEXT: renamable $vgpr3 = COPY killed renamable $vgpr28
+ ; GCN-NEXT: renamable $vgpr4 = COPY killed renamable $vgpr27
+ ; GCN-NEXT: renamable $vgpr5 = COPY killed renamable $vgpr26
+ ; GCN-NEXT: renamable $vgpr6 = COPY killed renamable $vgpr25
+ ; GCN-NEXT: renamable $vgpr7 = COPY killed renamable $vgpr24
+ ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr23
+ ; GCN-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr22
+ ; GCN-NEXT: renamable $vgpr10 = COPY killed renamable $vgpr21
+ ; GCN-NEXT: renamable $vgpr11 = COPY killed renamable $vgpr20
+ ; GCN-NEXT: renamable $vgpr12 = COPY killed renamable $vgpr19
+ ; GCN-NEXT: renamable $vgpr13 = COPY killed renamable $vgpr18
+ ; GCN-NEXT: renamable $vgpr14 = COPY killed renamable $vgpr17
+ ; GCN-NEXT: renamable $vgpr15 = COPY killed renamable $vgpr16
+ ; GCN-NEXT: SI_SPILL_V512_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, %stack.1, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.1, align 4, addrspace 5)
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_MOV_B64 $exec
; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5)
- ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF
; GCN-NEXT: renamable $sgpr0_sgpr1 = IMPLICIT_DEF
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.1:
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.3(0x40000000)
; GCN-NEXT: {{ $}}
- ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.2, align 4, addrspace 5)
- ; GCN-NEXT: dead %45:vgpr_32 = COPY [[DEF]]
- ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 [[COPY]](s32), implicit $exec
- ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, [[COPY]](s32), implicit $exec
+ ; GCN-NEXT: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.4, implicit $exec, implicit $sgpr32 :: (load (s64) from %stack.4, align 4, addrspace 5)
+ ; GCN-NEXT: $vgpr17 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+ ; GCN-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = SI_SPILL_V512_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.1, align 4, addrspace 5)
+ ; GCN-NEXT: $vgpr16 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5)
+ ; GCN-NEXT: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr16, implicit $exec
+ ; GCN-NEXT: renamable $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 $sgpr2, $vgpr16, implicit $exec
; GCN-NEXT: renamable $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed renamable $sgpr0_sgpr1, implicit-def $exec, implicit-def dead $scc, implicit $exec
- ; GCN-NEXT: [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_:%[0-9]+]]:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 %28, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec
- ; GCN-NEXT: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]]
+ ; GCN-NEXT: renamable $vgpr0 = V_INDIRECT_REG_READ_GPR_IDX_B32_V16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $sgpr2, 11, implicit-def $m0, implicit $m0, implicit $exec
+ ; GCN-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+ ; GCN-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY renamable $sgpr0_sgpr1
- ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: SI_SPILL_S64_SAVE killed $sgpr2_sgpr3, %stack.4, implicit $exec, implicit $sgpr32 :: (store (s64) into %stack.4, align 4, addrspace 5)
; GCN-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr0_sgpr1, implicit-def dead $scc
; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
; GCN-NEXT: {{ $}}
@@ -99,8 +103,9 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
; GCN-NEXT: $exec = S_MOV_B64 renamable $sgpr0_sgpr1
; GCN-NEXT: {{ $}}
; GCN-NEXT: bb.2:
- ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.1, align 4, addrspace 5)
- ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET [[V_INDIRECT_REG_READ_GPR_IDX_B32_V16_]], killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
+ ; GCN-NEXT: $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+ ; GCN-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sgpr32 :: (load (s128) from %stack.2, align 4, addrspace 5)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.out.load, addrspace 1)
; GCN-NEXT: S_ENDPGM 0
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
index 3d597b998a655..b8bb3a5a242a1 100644
--- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
+++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll
@@ -227,14 +227,14 @@ entry:
; W64-O0-DAG: s_mov_b32 [[IDX_S:s[0-9]+]], s{{[0-9]+}}
; W64-O0-DAG: v_mov_b32_e32 [[IDX_V:v[0-9]+]], s{{[0-9]+}}
-; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
; W64-O0-DAG: s_mov_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], exec
+; W64-O0-DAG: buffer_store_dword [[IDX_V]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
; W64-O0: [[LOOPBB0:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1
-; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; W64-O0: s_waitcnt vmcnt(0)
; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]]
; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
@@ -251,7 +251,7 @@ entry:
; W64-O0-DAG: s_mov_b32 s[[S2:[0-9]+]], s[[SRSRCTMP2]]
; W64-O0-DAG: s_mov_b32 s[[S3:[0-9]+]], s[[SRSRCTMP3]]
; W64-O0: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword [[IDX:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 ; 4-byte Folded Reload
; W64-O0: buffer_load_format_x [[RES:v[0-9]+]], [[IDX]], s[[[S0]]:[[S3]]], {{.*}} idxen
; W64-O0: s_waitcnt vmcnt(0)
; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP:[0-9]+]] ; 4-byte Folded Spill
@@ -270,10 +270,10 @@ entry:
; W64-O0: v_writelane_b32 [[VSAVEEXEC]], s[[SAVEEXEC1]], [[SAVEEXEC_IDX1:[0-9]+]]
; W64-O0: [[LOOPBB1:.LBB[0-9]+_[0-9]+]]: ; =>This Inner Loop Header: Depth=1
-; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC0:[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC1:[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC2:[0-9]+]], off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; W64-O0: buffer_load_dword v[[VRSRC3:[0-9]+]], off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; W64-O0: s_waitcnt vmcnt(0)
; W64-O0-DAG: v_readfirstlane_b32 s[[S0:[0-9]+]], v[[VRSRC0]]
; W64-O0-DAG: v_readfirstlane_b32 s[[SRSRCTMP1:[0-9]+]], v[[VRSRC1]]
@@ -297,10 +297,10 @@ entry:
; W64-O0: s_xor_b64 exec, exec, [[SAVE]]
; W64-O0-NEXT: s_cbranch_execnz [[LOOPBB1]]
-; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
; W64-O0: v_readlane_b32 s[[SAVEEXEC0:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX0]]
; W64-O0: v_readlane_b32 s[[SAVEEXEC1:[0-9]+]], [[VSAVEEXEC]], [[SAVEEXEC_IDX1]]
; W64-O0: s_mov_b64 exec, s[[[SAVEEXEC0]]:[[SAVEEXEC1]]]
+; W64-O0: buffer_load_dword [[RES:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF_TMP]] ; 4-byte Folded Reload
; W64-O0: buffer_store_dword [[RES]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:[[RES_OFF]] ; 4-byte Folded Spill
; W64-O0: [[TERMBB]]:
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index e00825b6f4f0b..1e424ecde23d5 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -19,10 +19,10 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: v_writelane_b32 v40, s33, 2
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_add_i32 s32, s32, 0x400
-; CHECK-NEXT: v_writelane_b32 v40, s30, 0
-; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: .Ltmp0:
; CHECK-NEXT: .loc 0 31 3 prologue_end ; lane-info.cpp:31:3
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _ZL13sleep_foreverv@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _ZL13sleep_foreverv@gotpcrel32@hi+12
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 515253e6a43f2..b7d45756f9c37 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -191,23 +191,23 @@ define void @spill_sgpr_with_tail_call() #0 {
; we have no VGPR to allocate for SGPR spills. We are forced to spill to memory.
; GCN-LABEL: {{^}}spill_sgpr_no_free_vgpr:
-; GCN: v_writelane_b32 v{{[0-9]+}}, s34, 0
-; GCN: v_writelane_b32 v{{[0-9]+}}, s35, 1
-; GCN: v_writelane_b32 v{{[0-9]+}}, s36, 2
-; GCN: v_writelane_b32 v{{[0-9]+}}, s37, 3
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
+; GCN: v_writelane_b32 [[A:v[0-9]+]], s34, 0
+; GCN: buffer_store_dword [[A]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[B:v[0-9]+]], s35, 0
+; GCN: buffer_store_dword [[B]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[C:v[0-9]+]], s36, 0
+; GCN: buffer_store_dword [[C]], off, s[0:3], s32
+; GCN: v_writelane_b32 [[D:v[0-9]+]], s37, 0
+; GCN: buffer_store_dword [[D]], off, s[0:3], s32
; GCN: #ASMEND
-; GCN: buffer_load_dword v{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}
-; GCN: buffer_load_dword v{{[0-9]+}}
-; GCN: v_readlane_b32 s37, v{{[0-9]+}}, 3
-; GCN: v_readlane_b32 s36, v{{[0-9]+}}, 2
-; GCN: v_readlane_b32 s35, v{{[0-9]+}}, 1
-; GCN: v_readlane_b32 s34, v{{[0-9]+}}, 0
+; GCN: buffer_load_dword [[E:v[0-9]+]]
+; GCN: v_readlane_b32 s37, [[E]], 0
+; GCN: buffer_load_dword [[F:v[0-9]+]]
+; GCN: v_readlane_b32 s36, [[F]], 0
+; GCN: buffer_load_dword [[G:v[0-9]+]]
+; GCN: v_readlane_b32 s35, [[G]], 0
+; GCN: buffer_load_dword [[H:v[0-9]+]]
+; GCN: v_readlane_b32 s34, [[H]], 0
define void @spill_sgpr_no_free_vgpr(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 5d3db4ea38c66..29f8c60ad281b 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -133,7 +133,7 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
; GFX9-O0: ; %bb.0: ; %entry
; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -144,18 +144,18 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_mov_b32 s39, s7
; GFX9-O0-NEXT: s_mov_b64 s[42:43], s[38:39]
; GFX9-O0-NEXT: s_mov_b64 s[40:41], s[36:37]
-; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0
-; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1
-; GFX9-O0-NEXT: v_writelane_b32 v3, s42, 2
-; GFX9-O0-NEXT: v_writelane_b32 v3, s43, 3
+; GFX9-O0-NEXT: v_writelane_b32 v5, s40, 0
+; GFX9-O0-NEXT: v_writelane_b32 v5, s41, 1
+; GFX9-O0-NEXT: v_writelane_b32 v5, s42, 2
+; GFX9-O0-NEXT: v_writelane_b32 v5, s43, 3
; GFX9-O0-NEXT: s_mov_b32 s34, 0
-; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34
+; GFX9-O0-NEXT: buffer_load_dwordx2 v[3:4], off, s[36:39], s34
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
-; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4
+; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
; GFX9-O0-NEXT: s_not_b64 exec, exec
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s34
; GFX9-O0-NEXT: s_not_b64 exec, exec
@@ -165,23 +165,23 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec
-; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4
-; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5
+; GFX9-O0-NEXT: v_writelane_b32 v5, s34, 4
+; GFX9-O0-NEXT: v_writelane_b32 v5, s35, 5
; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37]
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2
; GFX9-O0-NEXT: ; %bb.1: ; %if
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
@@ -194,20 +194,19 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1
-; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; GFX9-O0-NEXT: .LBB1_2: ; %merge
-; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: s_nop 0
-; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 4
-; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 5
+; GFX9-O0-NEXT: v_readlane_b32 s34, v5, 4
+; GFX9-O0-NEXT: v_readlane_b32 s35, v5, 5
; GFX9-O0-NEXT: s_or_b64 exec, exec, s[34:35]
-; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 0
-; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 1
-; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 2
-; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 3
+; GFX9-O0-NEXT: v_readlane_b32 s36, v5, 0
+; GFX9-O0-NEXT: v_readlane_b32 s37, v5, 1
+; GFX9-O0-NEXT: v_readlane_b32 s38, v5, 2
+; GFX9-O0-NEXT: v_readlane_b32 s39, v5, 3
+; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
-; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v4
+; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[34:35], v0, v3
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[34:35]
; GFX9-O0-NEXT: s_mov_b32 s34, 1
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s34, v0
@@ -216,7 +215,7 @@ define amdgpu_gfx void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
; GFX9-O0-NEXT: s_mov_b32 s34, 0
; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4
; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1
-; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35]
More information about the llvm-commits
mailing list