[llvm] 690f5b7 - [AMDGPU] Fix function calls with flat scratch

Sebastian Neubauer via llvm-commits llvm-commits at lists.llvm.org
Fri May 28 02:33:07 PDT 2021


Author: Sebastian Neubauer
Date: 2021-05-28T11:22:13+02:00
New Revision: 690f5b7a0128a210093e9b217932743ad35b5c5a

URL: https://github.com/llvm/llvm-project/commit/690f5b7a0128a210093e9b217932743ad35b5c5a
DIFF: https://github.com/llvm/llvm-project/commit/690f5b7a0128a210093e9b217932743ad35b5c5a.diff

LOG: [AMDGPU] Fix function calls with flat scratch

When flat scratch is used, the stack pointer needs to be added when
writing call arguments to the stack.
For buffer instructions, this was previously handled in
SelectMUBUFScratchOffen and SelectMUBUFScratchOffset.

Move the stack-pointer addition to call argument lowering instead, as is
already done in GlobalISel.
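In the SelectionDAG path this amounts to computing the destination
address of an outgoing stack argument as SP + offset. A minimal sketch,
paraphrasing the LowerCall/storeStackInputValue hunks below (Chain, DL,
PtrOff and Info are the surrounding variables in that code):

  // Stores to the argument stack area are relative to the stack pointer,
  // so read SP and fold it into the store address up front instead of
  // patching it in during MUBUF/scratch instruction selection.
  SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
                                  MVT::i32);
  SDValue DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);

This also lets the instruction selectors stop inspecting the
MachinePointerInfo to guess whether an access is stack-pointer relative
(the removed isStackPtrRelative helpers below).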

Differential Revision: https://reviews.llvm.org/D103166

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
    llvm/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
    llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
    llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 24e35d3f3e88..e2ceaa223149 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1497,11 +1497,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
   return false;
 }
 
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
-  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
-  return PSV && PSV->isStack();
-}
-
 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
   SDLoc DL(N);
 
@@ -1538,13 +1533,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
       VAddr = SDValue(MovHighBits, 0);
 
-      // In a call sequence, stores to the argument stack area are relative to the
-      // stack pointer.
-      const MachinePointerInfo &PtrInfo
-        = cast<MemSDNode>(Parent)->getPointerInfo();
-      SOffset = isStackPtrRelative(PtrInfo)
-        ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
-        : CurDAG->getTargetConstant(0, DL, MVT::i32);
+      SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
       ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
       return true;
     }
@@ -1587,28 +1576,52 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
   return true;
 }
 
+static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
+  if (Val.getOpcode() != ISD::CopyFromReg)
+    return false;
+  auto RC =
+      TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
+  return RC && TRI.isSGPRClass(RC);
+}
+
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
                                                   SDValue Addr,
                                                   SDValue &SRsrc,
                                                   SDValue &SOffset,
                                                   SDValue &Offset) const {
-  ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
-  if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
-    return false;
-
-  SDLoc DL(Addr);
+  const SIRegisterInfo *TRI =
+      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
   MachineFunction &MF = CurDAG->getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  SDLoc DL(Addr);
 
-  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+  // CopyFromReg <sgpr>
+  if (IsCopyFromSGPR(*TRI, Addr)) {
+    SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+    SOffset = Addr;
+    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+    return true;
+  }
 
-  const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
+  ConstantSDNode *CAddr;
+  if (Addr.getOpcode() == ISD::ADD) {
+    // Add (CopyFromReg <sgpr>) <constant>
+    CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
+      return false;
+    if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
+      return false;
 
-  // FIXME: Get from MachinePointerInfo? We should only be using the frame
-  // offset if we know this is in a call sequence.
-  SOffset = isStackPtrRelative(PtrInfo)
-                ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
-                : CurDAG->getTargetConstant(0, DL, MVT::i32);
+    SOffset = Addr.getOperand(0);
+  } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
+             SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
+    // <constant>
+    SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  } else {
+    return false;
+  }
+
+  SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
   return true;
@@ -1890,19 +1903,21 @@ static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
 }
 
 // Match (32-bit SGPR base) + sext(imm offset)
-bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
-                                            SDValue Addr,
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
                                             SDValue &SAddr,
                                             SDValue &Offset) const {
   if (Addr->isDivergent())
     return false;
 
-  SAddr = Addr;
+  SDLoc DL(Addr);
+
   int64_t COffsetVal = 0;
 
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
     SAddr = Addr.getOperand(0);
+  } else {
+    SAddr = Addr;
   }
 
   SAddr = SelectSAddrFI(CurDAG, SAddr);
@@ -1917,14 +1932,15 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
 
     COffsetVal = SplitImmOffset;
 
-    SDLoc DL(N);
     SDValue AddOffset =
-        getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+        SAddr.getOpcode() == ISD::TargetFrameIndex
+            ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
+            : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
                                            SAddr, AddOffset), 0);
   }
 
-  Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+  Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
 
   return true;
 }

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8c5375b8c096..4e7a573551d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4156,8 +4156,13 @@ SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
                                                    int64_t Offset) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
+  // Stores to the argument stack area are relative to the stack pointer.
+  SDValue SP =
+      DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
+  Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
                                MachineMemOperand::MODereferenceable);
   return Store;

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2fa7511dd0d1..2081f0f2b7f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3692,11 +3692,6 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
   }};
 }
 
-static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
-  auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
-  return PSV && PSV->isStack();
-}
-
 InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
   MachineInstr *MI = Root.getParent();
@@ -3818,18 +3813,13 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
 
   const MachineFunction *MF = MBB->getParent();
   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-  const MachineMemOperand *MMO = *MI->memoperands_begin();
-  const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
 
   return {{
       [=](MachineInstrBuilder &MIB) { // rsrc
         MIB.addReg(Info->getScratchRSrcReg());
       },
       [=](MachineInstrBuilder &MIB) { // soffset
-        if (isStackPtrRelative(PtrInfo))
-          MIB.addReg(Info->getStackPtrOffsetReg());
-        else
-          MIB.addImm(0);
+        MIB.addImm(0);
       },
       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
   }};

diff  --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 65756f620b90..e7b1bd580ade 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3123,7 +3123,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
         // locations, which are supposed to be immutable?
         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
       } else {
-        DstAddr = PtrOff;
+        // Stores to the argument stack area are relative to the stack pointer.
+        SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
+                                        MVT::i32);
+        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
         Alignment =
             commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index 807124bf6d6b..28b56d004f5d 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -494,11 +494,11 @@ define void @too_many_args_use_workitem_id_x_byval(
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN-DAG: s_movk_i32 s32, 0x400
 
 ; GCN: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; GCN: s_movk_i32 s32, 0x400
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 
 ; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],

diff  --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index e44bf8f57cf8..760f062d2b68 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -609,10 +609,10 @@ define void @too_many_args_use_workitem_id_x_byval(
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
 ; VARABI: enable_vgpr_workitem_id = 0
 ; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; VARABI: s_movk_i32 s32, 0x400{{$}}
 ; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4
-; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4
+; VARABI: s_movk_i32 s32, 0x400{{$}}
+; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
 
 ; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
@@ -656,8 +656,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
 ; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
-; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}}
+; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
 ; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
 ; VARABI: s_swappc_b64

diff  --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index 78237dc00227..651b5ef03350 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -160,14 +160,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12
-; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
@@ -257,7 +256,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
@@ -266,7 +264,7 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s2
+; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -356,7 +354,6 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    global_load_ubyte v0, v[0:1], off glc dlc
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
@@ -365,7 +362,7 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s2
+; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -4184,7 +4181,6 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
@@ -4205,7 +4201,7 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(8)
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s2
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -4967,14 +4963,13 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 1
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12
-; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s2
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX10-SCRATCH-NEXT:    scratch_store_byte off, v0, s32
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
@@ -8351,16 +8346,10 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 18
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 20
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 16
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, 12
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, 8
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 1
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 3
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 4
@@ -8385,19 +8374,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s51
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s50
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s49
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s48
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s47
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s46
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s2
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s3
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s20
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v3, s21
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v4, s22
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v5, s23
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
@@ -8408,6 +8390,8 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 17
@@ -8631,20 +8615,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 18
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 24
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, 20
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, 12
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, 0
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 1
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s36, 0
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s37, 1
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s38, 2
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s39, 3
-; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s40, 4
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s41, 5
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s42, 6
@@ -8657,42 +8633,39 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s49, 13
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s50, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s51, 15
+; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-SCRATCH-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
 ; GFX10-SCRATCH-NEXT:    ; kill: killed $sgpr0_sgpr1
+; GFX10-SCRATCH-NEXT:    s_clause 0x1
 ; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[36:51], s[0:1], 0x40
+; GFX10-SCRATCH-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x0
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_v32i32_i32_inreg@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_v32i32_i32_inreg@rel32@hi+12
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 17
 ; GFX10-SCRATCH-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 8
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s50
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s49
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s3
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s51
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s48
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s47
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s46
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s20
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s21
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s22
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v3, s2
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v4, s23
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v5, s24
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, s2
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, s50
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, s51
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, s46
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, s47
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, s48
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, s49
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s20, s36
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s21, s37
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s22, s38
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s23, s39
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s24, s40
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s25, s41
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s26, s42
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s27, s43
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s28, s44
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s29, s45
+; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v6, s32 offset:24
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 16
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 17
@@ -8804,21 +8777,15 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s2, 4
-; GFX10-SCRATCH-NEXT:    s_clause 0x1
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v32, off, s33 offset:4
-; GFX10-SCRATCH-NEXT:    scratch_load_dword v33, off, s33
 ; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
-; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s3, 0
+; GFX10-SCRATCH-NEXT:    scratch_load_dwordx2 v[32:33], off, s33
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
+; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s31, 1
-; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v32, s2
 ; GFX10-SCRATCH-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v33, s3
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx2 off, v[32:33], s32
 ; GFX10-SCRATCH-NEXT:    s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s0, v40, 0
 ; GFX10-SCRATCH-NEXT:    v_readlane_b32 s1, v40, 1
@@ -8977,26 +8944,20 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 15
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 12
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 14
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 12
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 13
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 8
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s0
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 4
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 12
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 13
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 14
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s1
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 2
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 2
@@ -9024,8 +8985,6 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 9
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 10
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 11
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12
@@ -9216,34 +9175,20 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 15
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 14
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 28
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 24
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 11
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 13
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 12
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 20
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 10
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 12
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 9
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s0
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 8
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 8
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 12
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 13
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 14
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 15
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 8
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 9
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 10
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 11
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s1
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s32
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
@@ -9276,8 +9221,6 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 5
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 6
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 7
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12
@@ -9464,34 +9407,20 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
 ; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT:    s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41700000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41600000
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 28
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 24
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41300000
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41500000
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41400000
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 20
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s33, 2
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s1
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41200000
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 12
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41100000
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s0
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 8
-; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41000000
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v0, s0
-; GFX10-SCRATCH-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s0, 4
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s1, 0
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0x41500000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0x41600000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v3, 0x41700000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v5, 0x41100000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v6, 0x41200000
+; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v7, 0x41300000
+; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
+; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    v_writelane_b32 v40, s30, 0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v1, s0
-; GFX10-SCRATCH-NEXT:    scratch_store_dword off, v2, s1
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
+; GFX10-SCRATCH-NEXT:    scratch_store_dwordx4 off, v[4:7], s32
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v2, 0
@@ -9524,8 +9453,6 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v29, 0x40a00000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v30, 0x40c00000
 ; GFX10-SCRATCH-NEXT:    v_mov_b32_e32 v31, 0x40e00000
-; GFX10-SCRATCH-NEXT:    s_mov_b32 s33, s32
-; GFX10-SCRATCH-NEXT:    s_add_u32 s32, s32, 16
 ; GFX10-SCRATCH-NEXT:    s_getpc_b64 s[0:1]
; GFX10-SCRATCH-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
; GFX10-SCRATCH-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12

diff  --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
index d3cf255309ef..df9904a4ad1a 100644
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -66,16 +66,15 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
 ; FLATSCR-NEXT:    s_cbranch_scc1 BB0_3
 ; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
 ; FLATSCR-NEXT:    s_mov_b32 s2, s32
-; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
-; FLATSCR-NEXT:    s_add_i32 s4, s2, s3
+; FLATSCR-NEXT:    s_add_i32 s3, s2, 0x1000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
+; FLATSCR-NEXT:    s_add_u32 s2, s2, 0x1000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
-; FLATSCR-NEXT:    s_add_u32 s2, s2, s3
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
 ; FLATSCR-NEXT:    s_lshl_b32 s2, s6, 2
-; FLATSCR-NEXT:    s_mov_b32 s32, s4
-; FLATSCR-NEXT:    s_add_i32 s4, s4, s2
-; FLATSCR-NEXT:    scratch_load_dword v2, off, s4
+; FLATSCR-NEXT:    s_mov_b32 s32, s3
+; FLATSCR-NEXT:    s_add_i32 s3, s3, s2
+; FLATSCR-NEXT:    scratch_load_dword v2, off, s3
 ; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
@@ -255,7 +254,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
 ; FLATSCR:       ; %bb.0: ; %entry
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; FLATSCR-NEXT:    s_mov_b32 s5, s33
+; FLATSCR-NEXT:    s_mov_b32 s4, s33
 ; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; FLATSCR-NEXT:    s_mov_b32 s33, s32
 ; FLATSCR-NEXT:    s_add_u32 s32, s32, 16
@@ -267,16 +266,15 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; FLATSCR-NEXT:    s_cbranch_execz BB2_3
 ; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
 ; FLATSCR-NEXT:    s_mov_b32 s2, s32
-; FLATSCR-NEXT:    s_movk_i32 s3, 0x1000
-; FLATSCR-NEXT:    s_add_i32 s4, s2, s3
+; FLATSCR-NEXT:    s_add_i32 s3, s2, 0x1000
+; FLATSCR-NEXT:    s_add_u32 s2, s2, 0x1000
 ; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
 ; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
-; FLATSCR-NEXT:    s_add_u32 s2, s2, s3
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
-; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s4
+; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s3
 ; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
 ; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v5
-; FLATSCR-NEXT:    s_mov_b32 s32, s4
+; FLATSCR-NEXT:    s_mov_b32 s32, s3
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
@@ -286,7 +284,7 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
 ; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_sub_u32 s32, s32, 16
-; FLATSCR-NEXT:    s_mov_b32 s33, s5
+; FLATSCR-NEXT:    s_mov_b32 s33, s4
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
 
 entry:


        

