[llvm] r362665 - AMDGPU: Don't fix emergency stack slot at offset 0

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 5 15:37:50 PDT 2019


Author: arsenm
Date: Wed Jun  5 15:37:50 2019
New Revision: 362665

URL: http://llvm.org/viewvc/llvm-project?rev=362665&view=rev
Log:
AMDGPU: Don't fix emergency stack slot at offset 0

This forced the caller to be aware of this, which is an ugly ABI
feature.

Partially reverts r295877. The original reasons for doing this are
mostly fixed. Alloca is now in a non-0 address space, so it should be
OK to have 0 as a valid pointer. Since we treat the absolute address
as the pointer value, this part only really needed to apply to
kernels.

Since r357093, we avoid the need to increment/decrement the offset
register in more cases, and since r354816 the scavenger can fail
without spilling, so it's less critical that we try to avoid an offset
that fits in the MUBUF offset.

Restrict to callable functions for now to split this into 2 steps to
limit thte number of test updates and in case anything breaks.

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
    llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
    llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
    llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
    llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
    llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
    llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll
    llvm/trunk/test/CodeGen/AMDGPU/function-args.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll
    llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll
    llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
    llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
    llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-offset-calculation.ll
    llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll
    llvm/trunk/test/CodeGen/AMDGPU/store-hi16.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Wed Jun  5 15:37:50 2019
@@ -773,22 +773,17 @@ void SIFrameLowering::processFunctionBef
       !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
     assert(RS && "RegScavenger required if spilling");
 
-    // We force this to be at offset 0 so no user object ever has 0 as an
-    // address, so we may use 0 as an invalid pointer value. This is because
-    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
-    // is required to be address space 0, we are forced to accept this for
-    // now. Ideally we could have the stack in another address space with 0 as a
-    // valid pointer, and -1 as the null value.
-    //
-    // This will also waste additional space when user stack objects require > 4
-    // byte alignment.
-    //
-    // The main cost here is losing the offset for addressing modes. However
-    // this also ensures we shouldn't need a register for the offset when
-    // emergency scavenging.
-    int ScavengeFI = MFI.CreateFixedObject(
-      TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
-    RS->addScavengingFrameIndex(ScavengeFI);
+    if (FuncInfo->isEntryFunction()) {
+      int ScavengeFI = MFI.CreateFixedObject(
+        TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
+      RS->addScavengingFrameIndex(ScavengeFI);
+    } else {
+      int ScavengeFI = MFI.CreateStackObject(
+        TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
+        TRI.getSpillAlignment(AMDGPU::SGPR_32RegClass),
+        false);
+      RS->addScavengingFrameIndex(ScavengeFI);
+    }
   }
 }
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Jun  5 15:37:50 2019
@@ -1940,12 +1940,6 @@ SDValue SITargetLowering::LowerFormalArg
   bool IsKernel = AMDGPU::isKernel(CallConv);
   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
 
-  if (!IsEntryFunc) {
-    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
-    // this when allocating argument fixed offsets.
-    CCInfo.AllocateStack(4, 4);
-  }
-
   if (IsShader) {
     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
 
@@ -2551,7 +2545,6 @@ SDValue SITargetLowering::LowerCall(Call
                           "unsupported call from graphics shader of function ");
   }
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2578,9 +2571,6 @@ SDValue SITargetLowering::LowerCall(Call
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  CCInfo.AllocateStack(4, 4);
-
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.

Modified: llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll Wed Jun  5 15:37:50 2019
@@ -4,14 +4,14 @@
 %struct.ByValStruct = type { [4 x i32] }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct:
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
 
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
 ; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:20{{$}}
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
 ; GCN-NOT: s32
 define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
 entry:
@@ -34,16 +34,16 @@ entry:
 ; GCN-DAG: buffer_store_dword v33
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
 ; GCN-DAG: v_writelane_b32
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
-; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5{{$}}
 
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:20{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:16{{$}}
 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]]
 
 ; GCN: s_swappc_b64
 
-; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:20{{$}}
+; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s5 offset:16{{$}}
 
 ; GCN: v_readlane_b32
 ; GCN-NOT: v_readlane_b32 s32
@@ -74,31 +74,31 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
 
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
 
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
 
 ; GCN-NOT: s_add_u32 s32, s32, 0x800
 
 
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
-
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
 
 ; GCN: s_swappc_b64
 ; GCN-NOT: v_readlane_b32 s32
@@ -144,20 +144,20 @@ entry:
 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
 
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
 ; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
 ; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
 ; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
 ; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
 
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:20
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
 
 
 ; GCN: s_swappc_b64
@@ -182,14 +182,14 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_align8:
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
 
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
 ; GCN-NOT: s32
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:24{{$}}
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}}
 ; GCN-NOT: s32
 define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
 entry:
@@ -222,20 +222,20 @@ entry:
 ; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
 ; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
 
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
 ; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
 ; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
 ; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
 ; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
 
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
 
 
 ; GCN: s_swappc_b64
@@ -267,30 +267,30 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
 
-; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:16
 
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
 
 ; GCN-NOT: s_add_u32 s32, s32, 0x800
 
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
-; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
-; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
-
-; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
-; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:20
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: s_waitcnt vmcnt(0)
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
 
 ; GCN: s_swappc_b64
 ; GCN-NOT: v_readlane_b32 s32

Modified: llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/call-argument-types.ll Wed Jun  5 15:37:50 2019
@@ -636,7 +636,7 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
 
 ; GCN: s_waitcnt
-; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
+; GCN: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
@@ -687,15 +687,15 @@ define amdgpu_kernel void @test_call_ext
 ; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8
 ; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12
 
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]] offset:4
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:8
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
+; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
 
 
 ; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8
 ; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12
 
-; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]] offset:4
-; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:8
+; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
+; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
 
 ; GCN-NEXT: s_swappc_b64
 ; GCN-NOT: [[SP]]
@@ -722,8 +722,8 @@ define amdgpu_kernel void @test_call_ext
 ; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12
 
 ; GCN-NOT: s_add_u32 [[SP]]
-; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
-; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:8
+; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
+; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
 ; GCN: s_swappc_b64
 ; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16
 ; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20
@@ -758,8 +758,8 @@ define amdgpu_kernel void @test_call_ext
 }
 
 ; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:8
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32{{$}}
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
 entry:
@@ -769,15 +769,15 @@ entry:
 
 ; GCN-LABEL: {{^}}tail_call_byval_align16:
 ; GCN-NOT: s32
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:32
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:36
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:20
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:20
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
 ; GCN: s_getpc_b64
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
@@ -789,15 +789,15 @@ entry:
 
 ; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
 ; GCN-NOT: s32
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
+; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4
 ; GCN: s_getpc_b64
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
@@ -808,13 +808,13 @@ entry:
 
 ; GCN-LABEL: {{^}}stack_12xv3i32:
 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG12]], {{.*$}}
 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
 ; GCN: v_mov_b32_e32 v31, 11
 ; GCN: s_getpc
 define void @stack_12xv3i32() #0 {
@@ -837,13 +837,13 @@ entry:
 
 ; GCN-LABEL: {{^}}stack_12xv3f32:
 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG12]], {{.*$}}
 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:4
 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:8
 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:12
 ; GCN: v_mov_b32_e32 v31, 0x41300000
 ; GCN: s_getpc
 define void @stack_12xv3f32() #0 {
@@ -867,21 +867,21 @@ entry:
 ; GCN-LABEL: {{^}}stack_8xv5i32:
 
 ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
-; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG8]], {{.*$}}
 ; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
 
 ; GCN: v_mov_b32_e32 v31, 7
 ; GCN: s_getpc
@@ -901,21 +901,21 @@ entry:
 
 ; GCN-LABEL: {{^}}stack_8xv5f32:
 ; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
-; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
+; GCN: buffer_store_dword [[REG8]], {{.*$}}
 ; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
+; GCN: buffer_store_dword [[REG9]], {{.*}} offset:4
 ; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
+; GCN: buffer_store_dword [[REG10]], {{.*}} offset:8
 ; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
+; GCN: buffer_store_dword [[REG11]], {{.*}} offset:12
 ; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
+; GCN: buffer_store_dword [[REG12]], {{.*}} offset:16
 ; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
+; GCN: buffer_store_dword [[REG13]], {{.*}} offset:20
 ; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
+; GCN: buffer_store_dword [[REG14]], {{.*}} offset:24
 ; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
+; GCN: buffer_store_dword [[REG15]], {{.*}} offset:28
 
 ; GCN: v_mov_b32_e32 v31, 0x40e00000
 ; GCN: s_getpc

Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-frame-setup.ll Wed Jun  5 15:37:50 2019
@@ -23,7 +23,7 @@ define void @callee_no_stack_no_fp_elim(
 ; GCN: ; %bb.0:
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4{{$}}
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}}
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack() #0 {
@@ -37,13 +37,13 @@ define void @callee_with_stack() #0 {
 ; GCN-NEXT: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
 
 ; GCN-DAG: v_writelane_b32 v32, s33,
 ; GCN-DAG: v_writelane_b32 v32, s34,
 ; GCN-DAG: v_writelane_b32 v32, s35,
 ; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5{{$}}
 ; GCN-DAG: s_mov_b32 s33, s5
 
 
@@ -52,7 +52,7 @@ define void @callee_with_stack() #0 {
 ; GCN-DAG: v_readlane_b32 s35,
 ; GCN-DAG: v_readlane_b32 s34,
 ; GCN-DAG: v_readlane_b32 s33,
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_and_call() #0 {
@@ -72,7 +72,7 @@ define void @callee_with_stack_and_call(
 ; GCN: s_waitcnt
 ; GCN: s_mov_b32 s5, s32
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
 ; GCN-DAG: v_writelane_b32 v32, s34, 1
@@ -84,7 +84,7 @@ define void @callee_with_stack_and_call(
 ; GCN-DAG: v_readlane_b32 s33, v32, 0
 
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 
 ; GCN: s_sub_u32 s32, s32, 0x400
@@ -99,7 +99,7 @@ declare void @external_void_func_void()
 ; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
 ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 
 ; GCN: v_writelane_b32 v32
@@ -107,7 +107,7 @@ declare void @external_void_func_void()
 ; GCN: v_readlane_b32 s{{[0-9]+}}, v32
 
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 
 ; GCN-NEXT: s_waitcnt

Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll Wed Jun  5 15:37:50 2019
@@ -116,7 +116,7 @@ define void @use_workgroup_id_x() #1 {
 ; GCN-LABEL: {{^}}use_stack_workgroup_id_x:
 ; GCN: s_waitcnt
 ; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
 ; GCN: ; use s6
 ; GCN: s_setpc_b64
 define void @use_stack_workgroup_id_x() #1 {
@@ -429,7 +429,7 @@ define amdgpu_kernel void @kern_indirect
 }
 
 ; GCN-LABEL: {{^}}use_every_sgpr_input:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
 ; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
@@ -577,7 +577,7 @@ define void @func_use_every_sgpr_input_c
 
 ; GCN: s_swappc_b64
 
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]]
 ; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}}

Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll Wed Jun  5 15:37:50 2019
@@ -230,11 +230,11 @@ define amdgpu_kernel void @kern_indirect
 }
 
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
 
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @too_many_args_use_workitem_id_x(
@@ -289,7 +289,7 @@ define void @too_many_args_use_workitem_
 
 ; GCN: s_mov_b32 s33, s7
 ; GCN: s_mov_b32 s32, s33
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
 ; GCN: s_mov_b32 s4, s33
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
@@ -307,7 +307,7 @@ define amdgpu_kernel void @kern_call_too
 
 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
 ; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
 ; GCN: s_swappc_b64
 define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
   store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -326,14 +326,14 @@ define void @func_call_too_many_args_use
 ; Requires loading and storing to stack slot.
 ; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
 ; GCN: s_add_u32 s32, s32, 0x400{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5{{$}}
 
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
 
 ; GCN: s_swappc_b64
 
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
 ; GCN: s_sub_u32 s32, s32, 0x400{{$}}
 ; GCN: s_setpc_b64
 define void @too_many_args_call_too_many_args_use_workitem_id_x(
@@ -350,18 +350,17 @@ define void @too_many_args_call_too_many
 }
 
 ; stack layout:
-; frame[0] = emergency stack slot
-; frame[1] = byval arg32
-; frame[2] = stack passed workitem ID x
-; frame[3] = VGPR spill slot
+; frame[0] = byval arg32
+; frame[1] = stack passed workitem ID x
+; frame[2] = VGPR spill slot
 
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
-; GCN: buffer_load_dword v0, off, s[0:3], s32 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 ; GCN: s_setpc_b64
 define void @too_many_args_use_workitem_id_x_byval(
   i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
@@ -410,13 +409,9 @@ define void @too_many_args_use_workitem_
   ret void
 }
 
-; frame[0] = emergency stack slot
-; frame[1] =
-
-; sp[0] = callee emergency stack slot reservation
-; sp[1] = byval
-; sp[2] = ??
-; sp[3] = stack passed workitem ID x
+; sp[0] = byval
+; sp[1] = ??
+; sp[2] = stack passed workitem ID x
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
 ; GCN: enable_vgpr_workitem_id = 0
@@ -427,10 +422,10 @@ define void @too_many_args_use_workitem_
 ; GCN-NOT: s32
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 
 ; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
@@ -451,11 +446,11 @@ define amdgpu_kernel void @kern_call_too
 
 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
 
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5{{$}}
+; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
 ; GCN: s_swappc_b64
 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
@@ -475,15 +470,15 @@ define void @func_call_too_many_args_use
 }
 
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
 ; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12{{$}}
-; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
 
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
 define void @too_many_args_use_workitem_id_xyz(
@@ -537,10 +532,9 @@ define void @too_many_args_use_workitem_
   ret void
 }
 
-; frame[0] = callee emergency stack slot
-; frame[1] = ID X
-; frame[2] = ID Y
-; frame[3] = ID Z
+; frame[0] = ID X
+; frame[1] = ID Y
+; frame[2] = ID Z
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
 ; GCN: enable_vgpr_workitem_id = 2
@@ -548,9 +542,9 @@ define void @too_many_args_use_workitem_
 ; GCN: s_mov_b32 s33, s7
 ; GCN: s_mov_b32 s32, s33
 
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
   call void @too_many_args_use_workitem_id_xyz(
@@ -567,15 +561,14 @@ define amdgpu_kernel void @kern_call_too
 
 ; workitem ID X in register, yz on stack
 ; v31 = workitem ID X
-; frame[0] = emergency slot
-; frame[1] = workitem Y
-; frame[2] = workitem Z
+; frame[0] = workitem Y
+; frame[1] = workitem Z
 
 ; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_stack_yz:
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_dword v31, off, s[0:3], s32{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
-; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword v31, off, s[0:3], s32 offset:4{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+]}}, v31
 
 ; GCN: s_waitcnt
@@ -631,9 +624,8 @@ define void @too_many_args_use_workitem_
   ret void
 }
 
-; frame[0] = callee emergency stack slot
-; frame[1] = ID Y
-; frame[2] = ID Z
+; frame[0] = ID Y
+; frame[1] = ID Z
 
 ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
 ; GCN: enable_vgpr_workitem_id = 2
@@ -642,8 +634,8 @@ define void @too_many_args_use_workitem_
 ; GCN: s_mov_b32 s32, s33
 
 ; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:4
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
   call void @too_many_args_use_workitem_id_x_stack_yz(

Modified: llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll Wed Jun  5 15:37:50 2019
@@ -30,7 +30,7 @@ define float @call_split_type_used_outsi
 ; GCN-NEXT:    s_mov_b32 s5, s32
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_writelane_b32 v32, s33, 0
 ; GCN-NEXT:    v_writelane_b32 v32, s34, 1
@@ -47,7 +47,7 @@ define float @call_split_type_used_outsi
 ; GCN-NEXT:    v_readlane_b32 s34, v32, 1
 ; GCN-NEXT:    v_readlane_b32 s33, v32, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -68,7 +68,7 @@ define float @call_split_type_used_outsi
 ; GCN-NEXT:    s_mov_b32 s5, s32
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_writelane_b32 v32, s33, 0
 ; GCN-NEXT:    v_writelane_b32 v32, s34, 1
@@ -85,7 +85,7 @@ define float @call_split_type_used_outsi
 ; GCN-NEXT:    v_readlane_b32 s34, v32, 1
 ; GCN-NEXT:    v_readlane_b32 s33, v32, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -106,7 +106,7 @@ define half @call_split_type_used_outsid
 ; GCN-NEXT:    s_mov_b32 s5, s32
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_writelane_b32 v32, s33, 0
 ; GCN-NEXT:    v_writelane_b32 v32, s34, 1
@@ -123,7 +123,7 @@ define half @call_split_type_used_outsid
 ; GCN-NEXT:    v_readlane_b32 s34, v32, 1
 ; GCN-NEXT:    v_readlane_b32 s33, v32, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -144,7 +144,7 @@ define { i32, half } @call_split_type_us
 ; GCN-NEXT:    s_mov_b32 s5, s32
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN-NEXT:    buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    v_writelane_b32 v32, s33, 0
 ; GCN-NEXT:    v_writelane_b32 v32, s34, 1
@@ -162,7 +162,7 @@ define { i32, half } @call_split_type_us
 ; GCN-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-NEXT:    v_readlane_b32 s33, v32, 0
 ; GCN-NEXT:    s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
 ; GCN-NEXT:    s_mov_b64 exec, s[6:7]
 ; GCN-NEXT:    s_sub_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_waitcnt vmcnt(0)

Modified: llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/frame-index-elimination.ll Wed Jun  5 15:37:50 2019
@@ -9,11 +9,8 @@
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN: s_sub_u32 s6, s32, s4
 
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
-
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
-; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
+; CI-NEXT: v_lshr_b32_e64 v0, s6, 6
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6
 
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -23,6 +20,36 @@ define void @func_mov_fi_i32() #0 {
   ret void
 }
 
+; Offset due to different objects
+; GCN-LABEL: {{^}}func_mov_fi_i32_offset:
+; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+
+; CI: s_sub_u32 s6, s32, s4
+; CI-NEXT: v_lshr_b32_e64 v0, s6, 6
+
+; CI: s_sub_u32 s6, s32, s4
+; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
+; CI-NEXT: v_add_i32_e64 v1, s[6:7], 4, [[SCALED]]
+; CI-NOT: v_mov
+; CI: ds_write_b32 v0, v0
+; CI-NEXT: ds_write_b32 v0, v1
+
+; GFX9: s_sub_u32 s6, s32, s4
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6
+; GFX9-DAG: ds_write_b32 v0, v0
+
+; GFX9-DAG: s_sub_u32 s6, s32, s4
+; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
+; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
+; GFX9-NEXT: ds_write_b32 v0, v0
+define void @func_mov_fi_i32_offset() #0 {
+  %alloca0 = alloca i32, addrspace(5)
+  %alloca1 = alloca i32, addrspace(5)
+  store volatile i32 addrspace(5)* %alloca0, i32 addrspace(5)* addrspace(3)* undef
+  store volatile i32 addrspace(5)* %alloca1, i32 addrspace(5)* addrspace(3)* undef
+  ret void
+}
+
 ; Materialize into an add of a constant offset from the FI.
 ; FIXME: Should be able to merge adds
 
@@ -31,12 +58,10 @@ define void @func_mov_fi_i32() #0 {
 ; GCN: s_sub_u32 s6, s32, s4
 
 ; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
-; CI-NEXT: v_add_i32_e32 v0, vcc, 4, v0
+; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]]
 
 ; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
-; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
 
 
 ; GCN-NOT: v_mov
@@ -54,11 +79,9 @@ define void @func_add_constant_to_fi_i32
 ; GCN-LABEL: {{^}}func_other_fi_user_i32:
 ; GCN: s_sub_u32 s6, s32, s4
 
-; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
-; CI-NEXT: v_add_i32_e64 v0, s[6:7], 4, [[SCALED]]
+; CI-NEXT: v_lshr_b32_e64 v0, s6, 6
 
-; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s6
-; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]]
+; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s6
 
 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0
 ; GCN-NOT: v_mov
@@ -92,12 +115,10 @@ define void @func_load_private_arg_i32_p
 ; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4
 
 ; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
-; CI-NEXT: v_add_i32_e64 [[ADD:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]]
-; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[ADD]]
+; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
 
 ; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
-; GFX9-NEXT: v_add_u32_e32 [[ADD:v[0-9]+]], 4, [[SHIFT]]
-; GFX9-NEXT: v_add_u32_e32 v0, 4, [[ADD]]
+; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]]
 
 ; GCN-NOT: v_mov
 ; GCN: ds_write_b32 v0, v0
@@ -130,17 +151,15 @@ define void @void_func_byval_struct_i8_i
 ; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4
 
 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
-; CI: v_add_i32_e64 [[ADD:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 4, [[SHIFT]]
 
 ; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
-; GFX9: v_add_u32_e32 [[ADD:v[0-9]+]], 4, [[SHIFT]]
 
 ; GCN: s_and_saveexec_b64
 
-; CI: v_add_i32_e32 v0, vcc, 4, [[ADD]]
+; CI: v_add_i32_e32 v0, vcc, 4, [[SHIFT]]
 ; CI: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4{{$}}
 
-; GFX9: v_add_u32_e32 v0, 4, [[ADD]]
+; GFX9: v_add_u32_e32 v0, 4, [[SHIFT]]
 ; GFX9: buffer_load_dword v1, v{{[0-9]+}}, s[0:3], s4 offen offset:4{{$}}
 
 ; GCN: ds_write_b32
@@ -162,7 +181,7 @@ ret:
 ; Added offset can't be used with VOP3 add
 ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32:
 ; GCN: s_sub_u32 s6, s32, s4
-; GCN-DAG: s_movk_i32 s6, 0x204
+; GCN-DAG: s_movk_i32 s6, 0x200
 
 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s6, 6
 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s[6:7], s6, [[SCALED]]
@@ -186,7 +205,7 @@ define void @func_other_fi_user_non_inli
 
 ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live:
 ; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s4
-; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x204
+; GCN-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200
 
 ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6
 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]]
@@ -241,7 +260,16 @@ bb5:
 
 ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
 ; GCN: s_and_saveexec_b64
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:12
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s4
+
+; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
+; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
+
+; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]]
+; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]]
+
+; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]]
 define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
   %alloca0 = alloca { i8, i32 }, align 4, addrspace(5)
   %cmp = icmp eq i32 %arg0, 0

Modified: llvm/trunk/test/CodeGen/AMDGPU/function-args.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/function-args.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/function-args.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/function-args.ll Wed Jun  5 15:37:50 2019
@@ -516,8 +516,8 @@ define void @void_func_struct_i8_i32({ i
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
-; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
 ; GCN-DAG: buffer_store_dword v[[ELT1]]
 ; GCN-DAG: buffer_store_byte v[[ELT0]]
 define void @void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval %arg0) #0 {
@@ -527,10 +527,10 @@ define void @void_func_byval_struct_i8_i
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
-; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
 
 ; GCN: ds_write_b32 v0, v0
 ; GCN: s_setpc_b64
@@ -544,7 +544,7 @@ define void @void_func_byval_struct_i8_i
 }
 
 ; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
-; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32{{$}}
 ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
 ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
 ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
@@ -566,9 +566,9 @@ define void @void_func_byval_i32_byval_i
 ; GCN-DAG: buffer_store_dwordx4 v[20:23], off
 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:8
 
 ; GCN: buffer_store_dword v[[LOAD_ARG1]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
@@ -581,14 +581,14 @@ define void @void_func_v32i32_i32_i64(<3
 
 ; FIXME: Different ext load types on CI vs. VI
 ; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
-; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+
+; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
 
 ; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
 ; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]
@@ -609,10 +609,10 @@ define void @void_func_v32i32_i1_i8_i16(
 }
 
 ; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
 
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
@@ -636,15 +636,15 @@ define void @void_func_v32i32_v2i16_v2f1
 }
 
 ; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
@@ -656,15 +656,15 @@ define void @void_func_v32i32_v2i64_v2f6
 }
 
 ; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
@@ -676,23 +676,23 @@ define void @void_func_v32i32_v4i32_v4f3
 }
 
 ; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
@@ -706,39 +706,39 @@ define void @void_func_v32i32_v8i32_v8f3
 }
 
 ; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:68{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:72{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:76{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:80{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:84{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:88{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:92{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:96{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:128{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
+
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:68{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:72{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:76{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:80{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:84{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:88{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:92{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:96{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
+; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
 define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
   store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
   store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef

Modified: llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/load-hi16.ll Wed Jun  5 15:37:50 2019
@@ -503,7 +503,7 @@ entry:
 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
 define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
+  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
   %load = load i16, i16 addrspace(5)* %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
@@ -522,7 +522,7 @@ entry:
 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
 define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
 entry:
-  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
+  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
   %load = load half, half addrspace(5)* %gep
   %build0 = insertelement <2 x half> undef, half %reg, i32 0
   %build1 = insertelement <2 x half> %build0, half %load, i32 1
@@ -577,7 +577,7 @@ entry:
 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -597,7 +597,7 @@ entry:
 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
   %ext = zext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
@@ -618,7 +618,7 @@ entry:
 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
   %ext = sext i8 %load to i16
   %bitcast = bitcast i16 %ext to half
@@ -639,7 +639,7 @@ entry:
 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
 define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -796,7 +796,7 @@ entry:
   %obj1 = alloca [4096 x i16], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
+  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
   %load = load i16, i16 addrspace(5)* %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
@@ -813,7 +813,7 @@ entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %load = load i8, i8 addrspace(5)* %gep
   %ext = sext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -831,7 +831,7 @@ entry:
   %obj1 = alloca [4096 x i8], align 2, addrspace(5)
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %load = load i8, i8 addrspace(5)* %gep
   %ext = zext i8 %load to i16
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -975,9 +975,9 @@ entry:
 ; FIXME: Is there a cost to using the extload over not?
 ; GCN-LABEL: {{^}}load_private_v2i16_split:
 ; GCN: s_waitcnt
-; GFX900: buffer_load_ushort v0, off, s[0:3], s32 offset:4{{$}}
+; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
 ; GFX900-NEXT: s_waitcnt
-; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:6
+; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
 ; GFX900-NEXT: s_waitcnt
 ; GFX900-NEXT: s_setpc_b64
 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {

Modified: llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/load-lo16.ll Wed Jun  5 15:37:50 2019
@@ -600,7 +600,7 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
+  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
   %load = load i16, i16 addrspace(5)* %gep
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
@@ -621,7 +621,7 @@ entry:
 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
 define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
 entry:
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
+  %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
   %load = load i16, i16 addrspace(5)* %gep
   %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
   %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
@@ -641,7 +641,7 @@ entry:
 define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
-  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
+  %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
   %load = load half, half addrspace(5)* %gep
   %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
   store <2 x half> %build1, <2 x half> addrspace(1)* undef
@@ -714,7 +714,7 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
   %ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
@@ -734,7 +734,7 @@ entry:
 define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
 entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
-  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
+  %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
   %load = load i8, i8 addrspace(5)* %gep
   %ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
@@ -905,7 +905,7 @@ entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
+  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
   %load = load volatile i16, i16 addrspace(5)* %gep
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
   store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
@@ -924,7 +924,7 @@ entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %load = load volatile i8, i8 addrspace(5)* %gep
   %load.ext = sext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
@@ -944,7 +944,7 @@ entry:
   %reg.bc = bitcast i32 %reg to <2 x i16>
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %load = load volatile i8, i8 addrspace(5)* %gep
   %load.ext = zext i8 %load to i16
   %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
@@ -964,7 +964,7 @@ entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %load = load volatile i8, i8 addrspace(5)* %gep
   %load.ext = sext i8 %load to i16
   %bitcast = bitcast i16 %load.ext to half
@@ -985,7 +985,7 @@ entry:
   %reg.bc = bitcast i32 %reg to <2 x half>
   %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
   store volatile i32 123, i32 addrspace(5)* %bc
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %load = load volatile i8, i8 addrspace(5)* %gep
   %load.ext = zext i8 %load to i16
   %bitcast = bitcast i16 %load.ext to half

Modified: llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll Wed Jun  5 15:37:50 2019
@@ -1,12 +1,12 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s
 
 ; CHECK-LABEL: spill_v2i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload
 
 define void @spill_v2i32() {
 entry:
@@ -25,12 +25,12 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v2f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:24 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:28 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:24 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:28 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload
 
 define void @spill_v2f32() {
 entry:
@@ -49,14 +49,14 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v3i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
 
 define void @spill_v3i32() {
 entry:
@@ -75,14 +75,14 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v3f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
 
 define void @spill_v3f32() {
 entry:
@@ -101,16 +101,16 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v4i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload
 
 define void @spill_v4i32() {
 entry:
@@ -129,16 +129,16 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v4f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:48 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:52 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:56 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:60 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:48 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:52 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:56 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:60 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload
 
 define void @spill_v4f32() {
 entry:
@@ -157,17 +157,16 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v5i32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload
-
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload
 define void @spill_v5i32() {
 entry:
   %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
@@ -185,17 +184,16 @@ entry:
 }
 
 ; CHECK-LABEL: spill_v5f32:
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:96 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:100 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:104 ; 4-byte Folded Spill
-; CHECK-DAG: buffer_store_dword v{{.*}} offset:108 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill
+; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill
 ; CHECK: ;;#ASMSTART
 ; CHECK-NEXT: ;;#ASMEND
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:96 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:100 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:104 ; 4-byte Folded Reload
-; CHECK-DAG: buffer_load_dword v{{.*}} offset:108 ; 4-byte Folded Reload
-
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload
+; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload
 define void @spill_v5f32() {
 entry:
   %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5)
@@ -211,6 +209,3 @@ entry:
 
   ret void
 }
-
-
-

Modified: llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/nested-calls.ll Wed Jun  5 15:37:50 2019
@@ -13,7 +13,7 @@ declare void @external_void_func_i32(i32
 ; GCN-DAG: s_add_u32 s32, s32, 0x400
 ; Spill CSR VGPR used for SGPR spilling
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 
 ; GCN-DAG: v_writelane_b32 v32, s33, 0
@@ -26,7 +26,7 @@ declare void @external_void_func_i32(i32
 ; GCN: v_readlane_b32 s34, v32, 1
 ; GCN: v_readlane_b32 s33, v32, 0
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
 
 ; GCN: s_sub_u32 s32, s32, 0x400

Modified: llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/sibling-call.ll Wed Jun  5 15:37:50 2019
@@ -19,7 +19,7 @@ define fastcc i32 @i32_fastcc_i32_i32(i3
 ; GCN-NEXT: v_mov_b32_e32 [[K:v[0-9]+]], 9
 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT: v_add_u32_e32 v0, v0, v1
-; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[K]], off, s[0:3], s32 offset:20
 ; GCN: s_waitcnt vmcnt(0)
 ; GCN: s_setpc_b64
 ; GCN: ; ScratchSize: 68
@@ -40,7 +40,7 @@ entry:
 
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
 ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
 ; GCN: s_setpc_b64
 ; GCN: ; ScratchSize: 68
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
@@ -54,7 +54,7 @@ entry:
 
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_callee_stack_object:
 ; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:24
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:20
 ; GCN: s_setpc_b64
 ; GCN: ; ScratchSize: 136
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i32 %b, i32 %c) #1 {
@@ -84,7 +84,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
 ; GCN: s_waitcnt
-; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s32{{$}}
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 
 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
@@ -100,7 +100,7 @@ define hidden fastcc i32 @i32_fastcc_i32
 ; Tail call disallowed with byval in parent.
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
 ; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
 ; GCN: s_swappc_b64
 ; GCN-NOT: v_readlane_b32 s32
 ; GCN: s_setpc_b64
@@ -110,14 +110,17 @@ entry:
   ret i32 %ret
 }
 
-; Tail call disallowed with byval in parent, not callee.
+; Tail call disallowed with byval in parent, not callee. The stack
+; usage of incoming arguments must be <= the outgoing stack
+; arguments.
+
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
 ; GCN-NOT: v0
 ; GCN-NOT: s32
 ; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
 ; GCN-NEXT: s_setpc_b64
-define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
+define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 {
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32 addrspace(5)* inttoptr (i32 16 to i32 addrspace(5)*))
   ret i32 %ret
@@ -125,8 +128,8 @@ entry:
 
 ; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
 
 ; CIVI-NEXT: v_add_{{i|u}}32_e32 v0, vcc, v0, v1
 ; CIVI: v_add_{{i|u}}32_e32 v0, vcc, v0, [[LOAD_0]]
@@ -149,19 +152,19 @@ define fastcc i32 @i32_fastcc_i32_i32_a3
 ; FIXME: Why load and store same location for stack args?
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
 
-; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 
-; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
 
 ; GCN-NOT: s32
 
-; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
 
-; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
-; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
 
 ; GCN-NOT: s32
 ; GCN: s_setpc_b64
@@ -173,7 +176,7 @@ entry:
 
 ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
-; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:44
+; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40
 ; GCN: s_setpc_b64
 define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
 entry:
@@ -203,11 +206,11 @@ entry:
 ; GCN: s_add_u32 s32, s32, 0x400
 
 ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:8
 ; GCN-NEXT: s_mov_b64 exec
 
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s5 ; 4-byte Folded Spill
 ; GCN-DAG: v_writelane_b32 v34, s33, 0
 ; GCN-DAG: v_writelane_b32 v34, s34, 1
 
@@ -221,10 +224,10 @@ entry:
 ; GCN-DAG: v_readlane_b32 s33, v34, 0
 ; GCN-DAG: v_readlane_b32 s34, v34, 1
 
-; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword v33, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
 ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
-; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12
+; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:8
 ; GCN-NEXT: s_mov_b64 exec
 
 ; GCN: s_sub_u32 s32, s32, 0x400
@@ -258,7 +261,7 @@ entry:
 
 ; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
 ; GCN-NOT: s33
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:48
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:44
 
 ; GCN-NOT: s33
 ; GCN: s_setpc_b64 s[6:7]

Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll Wed Jun  5 15:37:50 2019
@@ -4,13 +4,13 @@
 ; storeRegToStackSlot.
 
 ; GCN-LABEL: {{^}}spill_csr_s5_copy:
-; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
 ; GCN: v_writelane_b32 v32, s5, 2
 ; GCN: s_swappc_b64
 ; GCN: v_readlane_b32 s5, v32, 2
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
-; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Reload
+; GCN: buffer_store_dword [[K]], off, s[0:3], s5{{$}}
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
 ; GCN: s_setpc_b64
 define void @spill_csr_s5_copy() #0 {
 bb:

Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-offset-calculation.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-offset-calculation.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-offset-calculation.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-offset-calculation.ll Wed Jun  5 15:37:50 2019
@@ -82,6 +82,55 @@ entry:
   ret void
 }
 
+; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail
+define void @test_sgpr_offset_function_scavenge_fail() #2 {
+entry:
+  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
+  ; fit in the instruction, and has to live in the SGPR offset.
+  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
+  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
+
+  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
+
+  %asm.0 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
+  %asm0.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 0
+  %asm1.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 1
+  %asm2.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 2
+  %asm3.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 3
+  %asm4.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 4
+  %asm5.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 5
+  %asm6.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 6
+  %asm7.0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm.0, 7
+
+  ; 0x40000 / 64 = 4096 (for wave64)
+  %a = load volatile i32, i32 addrspace(5)* %aptr
+
+  ; CHECK: s_add_u32 s32, s32, 0x40000
+  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Spill
+  ; CHECK: s_sub_u32 s32, s32, 0x40000
+  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0.0, i32 %asm1.0, i32 %asm2.0, i32 %asm3.0, i32 %asm4.0, i32 %asm5.0, i32 %asm6.0, i32 %asm7.0, i32 %a)
+
+  %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"()
+  %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0
+  %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1
+  %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2
+  %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3
+  %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4
+  %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5
+  %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6
+  %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7
+
+  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
+
+  ; CHECK: s_add_u32 s32, s32, 0x40000
+  ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s32 ; 4-byte Folded Reload
+  ; CHECK: s_sub_u32 s32, s32, 0x40000
+
+   ; Force %a to spill with no free SGPRs
+  call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a)
+  ret void
+}
+
 ; CHECK-LABEL: test_sgpr_offset_subregs_kernel
 define amdgpu_kernel void @test_sgpr_offset_subregs_kernel() {
 entry:
@@ -145,7 +194,7 @@ define void @test_inst_offset_function()
 entry:
   ; Occupy 4092 bytes of scratch, so the offset of the spill of %a just fits in
   ; the instruction offset field.
-  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
+  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
@@ -166,7 +215,7 @@ define void @test_sgpr_offset_function()
 entry:
   ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
   ; fit in the instruction, and has to live in the SGPR offset.
-  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
+  %alloca = alloca i8, i32 4096, align 4, addrspace(5)
   %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
 
   %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
@@ -190,7 +239,7 @@ entry:
   ; Occupy 4088 bytes of scratch, so that the spill of the last subreg of %a
   ; still fits below offset 4096 (4088 + 8 - 4 = 4092), and can be placed in
   ; the instruction offset field.
-  %alloca = alloca i8, i32 4084, align 4, addrspace(5)
+  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
@@ -218,7 +267,7 @@ entry:
   ; Occupy 4092 bytes of scratch, so that the spill of the last subreg of %a
   ; does not fit below offset 4096 (4092 + 8 - 4 = 4096), and has to live
   ; in the SGPR offset.
-  %alloca = alloca i8, i32 4088, align 4, addrspace(5)
+  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
   %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
   %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)*
 
@@ -244,3 +293,4 @@ entry:
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind "amdgpu-num-sgpr"="18" "amdgpu-num-vgpr"="8" }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="16" "amdgpu-num-vgpr"="8" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/stack-realign.ll Wed Jun  5 15:37:50 2019
@@ -10,8 +10,9 @@
 
 ; GCN-LABEL: {{^}}needs_align16_default_stack_align:
 ; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s4
-; GCN-NEXT: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]]
-; GCN: v_add_u32_e64 [[FI:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 16, [[FRAMEDIFF]]
+; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0
+; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]]
+; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]]
 
 ; GCN-NOT: s32
 
@@ -126,10 +127,10 @@ define amdgpu_kernel void @kernel_call_a
 ; GCN-LABEL: {{^}}default_realign_align128:
 ; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
 ; GCN-NEXT: s_and_b32 s5, [[TMP]], 0xffffe000
-; GCN-NEXT: s_add_u32 s32, s32, 0x6000
+; GCN-NEXT: s_add_u32 s32, s32, 0x4000
 ; GCN-NOT: s5
-; GCN: buffer_store_dword v0, off, s[0:3], s5 offset:128
-; GCN: s_sub_u32 s32, s32, 0x6000
+; GCN: buffer_store_dword v0, off, s[0:3], s5{{$}}
+; GCN: s_sub_u32 s32, s32, 0x4000
 define void @default_realign_align128(i32 %idx) #0 {
   %alloca.align = alloca i32, align 128, addrspace(5)
   store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
@@ -138,7 +139,7 @@ define void @default_realign_align128(i3
 
 ; GCN-LABEL: {{^}}disable_realign_align128:
 ; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
 ; GCN-NOT: s32
 define void @disable_realign_align128(i32 %idx) #3 {
   %alloca.align = alloca i32, align 128, addrspace(5)

Modified: llvm/trunk/test/CodeGen/AMDGPU/store-hi16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/store-hi16.ll?rev=362665&r1=362664&r2=362665&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/store-hi16.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/store-hi16.ll Wed Jun  5 15:37:50 2019
@@ -492,7 +492,7 @@ define void @store_private_hi_v2i16_max_
 entry:
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2045
+  %gep = getelementptr inbounds i16, i16 addrspace(5)* %out, i64 2047
   store i16 %hi, i16 addrspace(5)* %gep
   ret void
 }
@@ -644,7 +644,7 @@ entry:
   store volatile i32 123, i32 addrspace(5)* %bc
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
+  %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
   store i16 %hi, i16 addrspace(5)* %gep
   ret void
 }
@@ -661,7 +661,7 @@ entry:
   store volatile i32 123, i32 addrspace(5)* %bc
   %value = bitcast i32 %arg to <2 x i16>
   %hi = extractelement <2 x i16> %value, i32 1
-  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
+  %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
   %trunc = trunc i16 %hi to i8
   store i8 %trunc, i8 addrspace(5)* %gep
   ret void




More information about the llvm-commits mailing list