[llvm] r340396 - AMDGPU: Fix not respecting byval alignment in call frame setup
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 22 04:09:45 PDT 2018
Author: arsenm
Date: Wed Aug 22 04:09:45 2018
New Revision: 340396
URL: http://llvm.org/viewvc/llvm-project?rev=340396&view=rev
Log:
AMDGPU: Fix not respecting byval alignment in call frame setup
This was hackily adding in the 4 bytes reserved for the callee's
emergency stack slot. Treat it like a normal stack allocation
so we get the correct alignment padding behavior. This fixes
an inconsistency between the caller and callee.
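As an illustration of the alignment behavior the fix relies on, here is a minimal standalone sketch. This is plain C++, not the LLVM CCState API: AllocateStackSlot is a hypothetical helper that only models what CCState::AllocateStack(Size, Align) does, and the 16-byte, align-8 byval struct is taken from the new byval-frame-setup.ll tests below.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Round Offset up to the next multiple of Align.
static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) / Align * Align;
}

struct StackState {
  uint64_t NextOffset = 0;
  // Model of CCState::AllocateStack: align the running offset, then bump it.
  uint64_t AllocateStackSlot(uint64_t Size, uint64_t Align) {
    uint64_t Offset = alignTo(NextOffset, Align);
    NextOffset = Offset + Size;
    return Offset;
  }
};

int main() {
  // New scheme: the callee's 4-byte emergency slot is an ordinary allocation,
  // so a following byval argument with align 8 is padded out to offset 8.
  StackState S;
  uint64_t Emergency = S.AllocateStackSlot(4, 4);  // offset 0
  uint64_t Arg0 = S.AllocateStackSlot(16, 8);      // offset 8 (4 bytes of padding)
  uint64_t Arg1 = S.AllocateStackSlot(16, 8);      // offset 24
  assert(Emergency == 0 && Arg0 == 8 && Arg1 == 24);

  // Old scheme (the bug): the arguments were laid out without the reserved
  // bytes and then shifted by a flat +4, leaving the byval copy misaligned.
  StackState Old;
  uint64_t OldArg0 = Old.AllocateStackSlot(16, 8) + 4;  // offset 4, not 8-aligned
  printf("new arg0 offset: %llu, old arg0 offset: %llu\n",
         (unsigned long long)Arg0, (unsigned long long)OldArg0);
  return 0;
}

The new layout matches the buffer_store_dword checks at s32 offset 8 and 24 in the added call_void_func_byval_struct_align8 tests.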
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=340396&r1=340395&r2=340396&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed Aug 22 04:09:45 2018
@@ -4003,13 +4003,12 @@ SDValue AMDGPUTargetLowering::loadStackI
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
- SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
- SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
+ SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
MachineMemOperand::MODereferenceable);
return Store;
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=340396&r1=340395&r2=340396&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Wed Aug 22 04:09:45 2018
@@ -287,7 +287,6 @@ public:
SDValue storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
- SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const;
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=340396&r1=340395&r2=340396&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Aug 22 04:09:45 2018
@@ -2181,11 +2181,11 @@ SDValue SITargetLowering::LowerCallResul
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI,
+ CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain,
- SDValue StackPtr) const {
+ SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
if (!CLI.CS)
@@ -2253,9 +2253,9 @@ void SITargetLowering::passSpecialInputs
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
} else {
- SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
- InputReg,
- OutgoingArg->getStackOffset());
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
MemOpChains.push_back(ArgStore);
}
}
@@ -2401,8 +2401,6 @@ SDValue SITargetLowering::LowerCall(Call
}
// The first 4 bytes are reserved for the callee's emergency stack slot.
- const unsigned CalleeUsableStackOffset = 4;
-
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2441,6 +2439,10 @@ SDValue SITargetLowering::LowerCall(Call
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+
+ // The first 4 bytes are reserved for the callee's emergency stack slot.
+ CCInfo.AllocateStack(4, 4);
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2488,10 +2490,6 @@ SDValue SITargetLowering::LowerCall(Call
}
}
- // Stack pointer relative accesses are done by changing the offset SGPR. This
- // is just the VGPR offset component.
- SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
-
SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
@@ -2535,7 +2533,7 @@ SDValue SITargetLowering::LowerCall(Call
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset;
- SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
+ SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
if (IsTailCall) {
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
@@ -2545,8 +2543,7 @@ SDValue SITargetLowering::LowerCall(Call
Offset = Offset + FPDiff;
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
- DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
- StackPtr);
+ DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
// Make sure any stack arguments overlapping with where we're storing
@@ -2581,7 +2578,7 @@ SDValue SITargetLowering::LowerCall(Call
}
// Copy special input registers after user input arguments.
- passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
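To make the new ordering easier to follow, below is a simplified standalone model of how LowerCall and passSpecialInputs now hand out stack offsets. CallFrameModel and its AllocateStack method are assumed stand-ins for CCState, and the two scenarios mirror the updated checks in callee-special-input-vgprs.ll: the emergency slot is reserved before AnalyzeCallOperands, user arguments come next, and stack-passed special inputs are appended last via CCInfo.AllocateStack(ArgVT.getStoreSize(), 4).

#include <cstdint>
#include <cstdio>

struct CallFrameModel {
  uint64_t NextOffset = 0;
  // Stand-in for CCState::AllocateStack(Size, Align).
  uint64_t AllocateStack(uint64_t Size, uint64_t Align) {
    uint64_t Offset = (NextOffset + Align - 1) / Align * Align;
    NextOffset = Offset + Size;
    return Offset;
  }
};

int main() {
  // too_many_args_use_workitem_id_xyz: the explicit arguments all fit in
  // registers, so only the emergency slot and the workitem IDs use the stack.
  CallFrameModel CCInfo;
  CCInfo.AllocateStack(4, 4);                   // callee emergency slot, offset 0
  uint64_t IDX = CCInfo.AllocateStack(4, 4);    // offset 4
  uint64_t IDY = CCInfo.AllocateStack(4, 4);    // offset 8
  uint64_t IDZ = CCInfo.AllocateStack(4, 4);    // offset 12
  printf("ID X at %llu, ID Y at %llu, ID Z at %llu\n",
         (unsigned long long)IDX, (unsigned long long)IDY,
         (unsigned long long)IDZ);

  // too_many_args_use_workitem_id_x_byval: a 4-byte byval copy (a single
  // dword in the checks) is assigned by AnalyzeCallOperands before the
  // workitem ID is appended.
  CallFrameModel CCInfo2;
  CCInfo2.AllocateStack(4, 4);                  // callee emergency slot, offset 0
  uint64_t ByVal = CCInfo2.AllocateStack(4, 4); // offset 4
  uint64_t IDX2 = CCInfo2.AllocateStack(4, 4);  // offset 8
  printf("byval at %llu, ID X at %llu\n",
         (unsigned long long)ByVal, (unsigned long long)IDX2);
  return 0;
}

These are the offsets (4/8/12, and 4/8) that replace the old 8/12/16 and 12 values in the test diff below, since the caller no longer adds the flat StackPtr offset on top of argument offsets that already account for the reserved bytes.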
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h?rev=340396&r1=340395&r2=340396&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h Wed Aug 22 04:09:45 2018
@@ -265,11 +265,11 @@ public:
void passSpecialInputs(
CallLoweringInfo &CLI,
+ CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
- SDValue Chain,
- SDValue StackPtr) const;
+ SDValue Chain) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
Modified: llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll?rev=340396&r1=340395&r2=340396&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/byval-frame-setup.ll Wed Aug 22 04:09:45 2018
@@ -110,7 +110,7 @@ entry:
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
-define void @call_void_func_byval_struct_func() #0 {
+define void @call_void_func_byval_struct_func() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
@@ -163,7 +163,7 @@ entry:
; GCN: s_swappc_b64
; GCN-NOT: s_sub_u32 s32
; GCN: s_endpgm
-define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
+define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
@@ -179,6 +179,146 @@ entry:
call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
ret void
+}
+
+; GCN-LABEL: {{^}}void_func_byval_struct_align8:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}}
+; GCN-NOT: s32
+
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}}
+; GCN-NOT: s32
+; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}}
+; GCN-NOT: s32
+define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
+entry:
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8
+ %add = add nsw i32 %tmp, 1
+ store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8
+ %add3 = add nsw i32 %tmp1, 2
+ store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8
+ store volatile i32 9, i32 addrspace(1)* null, align 4
+ ret void
+}
+
+; Make sure the byval alignment is respected in the call frame setup
+; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel:
+; GCN: s_mov_b32 s33, s7
+; GCN: s_add_u32 s32, s33, 0xc00{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
+; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
+
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+
+; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
+; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
+; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
+; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+
+
+; GCN: s_swappc_b64
+; GCN-NOT: s_sub_u32 s32
+; GCN: s_endpgm
+define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
+ call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ ret void
+}
+
+; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
+; GCN: s_mov_b32 s5, s32
+; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
+; GCN-DAG: v_writelane_b32
+
+; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
+; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
+
+; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
+
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
+
+; GCN-NOT: s_add_u32 s32, s32, 0x800
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
+
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
+
+; GCN: s_swappc_b64
+; GCN-NOT: v_readlane_b32 s32
+; GCN: v_readlane_b32
+; GCN-NOT: v_readlane_b32 s32
+
+; GCN-NOT: s_sub_u32 s32, s32, 0x800
+
+; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @call_void_func_byval_struct_align8_func() #0 {
+entry:
+ %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
+ %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
+ call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
+ store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
+ store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
+ call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
+ call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
+ ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
Modified: llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll?rev=340396&r1=340395&r2=340396&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll Wed Aug 22 04:09:45 2018
@@ -290,7 +290,7 @@ define void @too_many_args_use_workitem_
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN: s_mov_b32 s4, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
@@ -308,7 +308,7 @@ define amdgpu_kernel void @kern_call_too
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
-; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -330,7 +330,7 @@ define void @func_call_too_many_args_use
; GCN: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
+; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
; GCN: s_swappc_b64
@@ -428,7 +428,7 @@ define void @too_many_args_use_workitem_
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
@@ -453,7 +453,7 @@ define amdgpu_kernel void @kern_call_too
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
@@ -539,11 +539,10 @@ define void @too_many_args_use_workitem_
ret void
}
-; frame[0] = kernel emergency stack slot
-; frame[1] = callee emergency stack slot
-; frame[2] = ID X
-; frame[3] = ID Y
-; frame[4] = ID Z
+; frame[0] = callee emergency stack slot
+; frame[1] = ID X
+; frame[2] = ID Y
+; frame[3] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2
@@ -551,9 +550,9 @@ define void @too_many_args_use_workitem_
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
call void @too_many_args_use_workitem_id_xyz(
@@ -635,10 +634,9 @@ define void @too_many_args_use_workitem_
ret void
}
-; frame[0] = kernel emergency stack slot
-; frame[1] = callee emergency stack slot
-; frame[2] = ID Y
-; frame[3] = ID Z
+; frame[0] = callee emergency stack slot
+; frame[1] = ID Y
+; frame[2] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2
@@ -647,8 +645,8 @@ define void @too_many_args_use_workitem_
; GCN: s_mov_b32 s32, s33
; GCN-DAG: v_mov_b32_e32 v31, v0
-; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
+; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
call void @too_many_args_use_workitem_id_x_stack_yz(