[llvm] 015b640 - AMDGPU: Add flag to use fixed function ABI
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 13:27:19 PDT 2020
Author: Matt Arsenault
Date: 2020-03-13T13:27:05-07:00
New Revision: 015b640be4c6ce35969be5c38d628feeadb48634
URL: https://github.com/llvm/llvm-project/commit/015b640be4c6ce35969be5c38d628feeadb48634
DIFF: https://github.com/llvm/llvm-project/commit/015b640be4c6ce35969be5c38d628feeadb48634.diff
LOG: AMDGPU: Add flag to use fixed function ABI
Pass all arguments to every function, rather than only passing the
minimum set of inputs needed for the call graph.
Added:
llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
Modified:
llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 99a01ca3a2fd..69e48227e732 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -8,6 +8,8 @@
#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
@@ -43,6 +45,10 @@ char AMDGPUArgumentUsageInfo::ID = 0;
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
+// Hardcoded registers from fixed function ABI
+const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo
+ = AMDGPUFunctionArgInfo::fixedABILayout();
+
bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) {
return false;
}
@@ -133,3 +139,41 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
}
llvm_unreachable("unexpected preloaded value type");
}
+
+constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
+ AMDGPUFunctionArgInfo AI;
+ AI.PrivateSegmentBuffer = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+ AI.DispatchPtr = AMDGPU::SGPR4_SGPR5;
+ AI.QueuePtr = AMDGPU::SGPR6_SGPR7;
+
+ // Do not pass kernarg segment pointer, only pass increment version in its
+ // place.
+ AI.ImplicitArgPtr = AMDGPU::SGPR8_SGPR9;
+ AI.DispatchID = AMDGPU::SGPR10_SGPR11;
+
+ // Skip FlatScratchInit/PrivateSegmentSize
+ AI.WorkGroupIDX = AMDGPU::SGPR12;
+ AI.WorkGroupIDY = AMDGPU::SGPR13;
+ AI.WorkGroupIDZ = AMDGPU::SGPR14;
+
+ const unsigned Mask = 0x3ff;
+ AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask);
+ AI.WorkItemIDY = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 10);
+ AI.WorkItemIDZ = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 20);
+ return AI;
+}
+
+const AMDGPUFunctionArgInfo &
+AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
+ auto I = ArgInfoMap.find(&F);
+ if (I == ArgInfoMap.end()) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ return FixedABIFunctionInfo;
+
+ // Without the fixed ABI, we assume no function has special inputs.
+ assert(F.isDeclaration());
+ return ExternFunctionInfo;
+ }
+
+ return I->second;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f0e7ee910f95..d3135593bf1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -40,19 +40,22 @@ struct ArgDescriptor {
bool IsSet : 1;
public:
- ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+ constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
bool IsStack = false, bool IsSet = false)
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
- static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
+ static constexpr ArgDescriptor createRegister(Register Reg,
+ unsigned Mask = ~0u) {
return ArgDescriptor(Reg, Mask, false, true);
}
- static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
+ static constexpr ArgDescriptor createStack(unsigned Offset,
+ unsigned Mask = ~0u) {
return ArgDescriptor(Offset, Mask, true, true);
}
- static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+ static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
+ unsigned Mask) {
return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
}
@@ -141,7 +144,7 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor ImplicitArgPtr;
// Input registers for non-HSA ABI
- ArgDescriptor ImplicitBufferPtr = 0;
+ ArgDescriptor ImplicitBufferPtr;
// VGPRs inputs. These are always v0, v1 and v2 for entry functions.
ArgDescriptor WorkItemIDX;
@@ -150,11 +153,14 @@ struct AMDGPUFunctionArgInfo {
std::pair<const ArgDescriptor *, const TargetRegisterClass *>
getPreloadedValue(PreloadedValue Value) const;
+
+ static constexpr AMDGPUFunctionArgInfo fixedABILayout();
};
class AMDGPUArgumentUsageInfo : public ImmutablePass {
private:
static const AMDGPUFunctionArgInfo ExternFunctionInfo;
+ static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
public:
@@ -175,15 +181,7 @@ class AMDGPUArgumentUsageInfo : public ImmutablePass {
ArgInfoMap[&F] = ArgInfo;
}
- const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
- auto I = ArgInfoMap.find(&F);
- if (I == ArgInfoMap.end()) {
- assert(F.isDeclaration());
- return ExternFunctionInfo;
- }
-
- return I->second;
- }
+ const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 555b215d8e5e..7553616a86dc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -139,6 +139,13 @@ static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
cl::init(true),
cl::Hidden);
+static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
+ "amdgpu-fixed-function-abi",
+ cl::desc("Enable all implicit function arguments"),
+ cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
+ cl::init(false),
+ cl::Hidden);
+
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
@@ -372,6 +379,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 70fa3961236f..2ef6cd5b3e33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -39,6 +39,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
public:
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
+ static bool EnableFixedFunctionABI;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index c3efa23795f4..991e0161e7fc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1688,10 +1688,11 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
-void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) const {
+/// Allocate implicit function VGPR arguments at the end of allocated user
+/// arguments.
+void SITargetLowering::allocateSpecialInputVGPRs(
+ CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
const unsigned Mask = 0x3ff;
ArgDescriptor Arg;
@@ -1709,6 +1710,20 @@ void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
+/// Allocate implicit function VGPR arguments in fixed registers.
+void SITargetLowering::allocateSpecialInputVGPRsFixed(
+ CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
+ Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
+ if (!Reg)
+ report_fatal_error("failed to allocated VGPR for implicit arguments");
+
+ const unsigned Mask = 0x3ff;
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
+}
+
void SITargetLowering::allocateSpecialInputSGPRs(
CCState &CCInfo,
MachineFunction &MF,
@@ -2091,6 +2106,10 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+ } else {
+ // For the fixed ABI, pass workitem IDs in the last argument register.
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
if (IsKernel) {
@@ -2202,7 +2221,7 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- if (!IsEntryFunc) {
+ if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -2483,6 +2502,8 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
+ report_fatal_error("failed to allocate implicit input argument");
} else {
unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
@@ -2549,6 +2570,7 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ CCInfo.AllocateReg(OutgoingArg->getRegister());
} else {
unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
@@ -2723,12 +2745,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // With a fixed ABI, allocate fixed registers before user arguments.
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ }
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2747,7 +2776,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// arguments to begin at SP+0. Completely unused for non-tail calls.
int32_t FPDiff = 0;
MachineFrameInfo &MFI = MF.getFrameInfo();
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -2764,7 +2792,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
- SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
@@ -2860,8 +2887,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
- // Copy special input registers after user input arguments.
- passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // Copy special input registers after user input arguments.
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ }
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 88541b3c59f3..a5d18f170e66 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -439,6 +439,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
+ void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 300b288405bb..79b74c5ede2d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,6 +8,7 @@
#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -60,6 +61,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// calls.
const bool HasCalls = FrameInfo.hasCalls() || F.hasFnAttribute("amdgpu-calls");
+ // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
+ // have any calls.
+ const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ (!isEntryFunction() || HasCalls);
+
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
if (!F.arg_empty())
KernargSegmentPtr = true;
@@ -94,23 +100,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+ if (UseFixedABI) {
WorkGroupIDX = true;
-
- if (F.hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
-
- if (F.hasFnAttribute("amdgpu-work-group-id-z"))
WorkGroupIDZ = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-x"))
WorkItemIDX = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-y"))
WorkItemIDY = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
+ ImplicitArgPtr = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+ WorkGroupIDX = true;
+
+ if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+ }
bool HasStackObjects = FrameInfo.hasStackObjects();
@@ -133,19 +149,27 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (isAmdHsaOrMesa) {
PrivateSegmentBuffer = true;
- if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
+ if (UseFixedABI) {
DispatchPtr = true;
-
- if (F.hasFnAttribute("amdgpu-queue-ptr"))
QueuePtr = true;
- if (F.hasFnAttribute("amdgpu-dispatch-id"))
+ // FIXME: We don't need this?
DispatchID = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
+ QueuePtr = true;
+
+ if (F.hasFnAttribute("amdgpu-dispatch-id"))
+ DispatchID = true;
+ }
} else if (ST.isMesaGfxShader(F)) {
ImplicitBufferPtr = true;
}
- if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+ if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
new file mode 100644
index 000000000000..497ea354fc09
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -0,0 +1,343 @@
+; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI %s
+; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+
+; GCN-LABEL: {{^}}use_dispatch_ptr:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define hidden void @use_dispatch_ptr() #1 {
+ %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+ %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+ %value = load volatile i32, i32 addrspace(4)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_queue_ptr:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define hidden void @use_queue_ptr() #1 {
+ %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+ %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+ %value = load volatile i32, i32 addrspace(4)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
+; GCN: s_mov_b64 [[PTR:s\[[0-9]+:[0-9]+\]]], 0
+; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
+define hidden void @use_kernarg_segment_ptr() #1 {
+ %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+ %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
+ %value = load volatile i32, i32 addrspace(4)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_implicitarg_ptr:
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define hidden void @use_implicitarg_ptr() #1 {
+ %implicit.arg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+ %header_ptr = bitcast i8 addrspace(4)* %implicit.arg.ptr to i32 addrspace(4)*
+ %value = load volatile i32, i32 addrspace(4)* %header_ptr
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_dispatch_id:
+; GCN: ; use s[10:11]
+define hidden void @use_dispatch_id() #1 {
+ %id = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %id)
+ ret void
+}
+; GCN-LABEL: {{^}}use_workgroup_id_x:
+; GCN: s_waitcnt
+; GCN: ; use s12
+define hidden void @use_workgroup_id_x() #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_stack_workgroup_id_x:
+; GCN: s_waitcnt
+; GCN-NOT: s32
+; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN: ; use s12
+; GCN: s_setpc_b64
+define hidden void @use_stack_workgroup_id_x() #1 {
+ %alloca = alloca i32, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_y:
+; GCN: s_waitcnt
+; GCN: ; use s13
+define hidden void @use_workgroup_id_y() #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_z:
+; GCN: s_waitcnt
+; GCN: ; use s14
+define hidden void @use_workgroup_id_z() #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_xy:
+; GCN: ; use s12
+; GCN: ; use s13
+define hidden void @use_workgroup_id_xy() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_xyz:
+; GCN: ; use s12
+; GCN: ; use s13
+; GCN: ; use s14
+define hidden void @use_workgroup_id_xyz() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ call void asm sideeffect "; use $0", "s"(i32 %val2)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_xz:
+; GCN: ; use s12
+; GCN: ; use s14
+define hidden void @use_workgroup_id_xz() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_workgroup_id_yz:
+; GCN: ; use s13
+; GCN: ; use s14
+define hidden void @use_workgroup_id_yz() #1 {
+ %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
+ %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val0)
+ call void asm sideeffect "; use $0", "s"(i32 %val1)
+ ret void
+}
+
+; Argument is in right place already
+; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x:
+; GCN-NOT: s12
+; GCN-NOT: s13
+; GCN-NOT: s14
+; GCN: v_readlane_b32 s4, v32, 0
+define hidden void @func_indirect_use_workgroup_id_x() #1 {
+ call void @use_workgroup_id_x()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
+; GCN-NOT: s4
+; GCN: v_readlane_b32 s4, v32, 0
+define hidden void @func_indirect_use_workgroup_id_y() #1 {
+ call void @use_workgroup_id_y()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
+; GCN-NOT: s4
+; GCN: v_readlane_b32 s4, v32, 0
+define hidden void @func_indirect_use_workgroup_id_z() #1 {
+ call void @use_workgroup_id_z()
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workgroup_id_x:
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN: ; use s12
+define hidden void @other_arg_use_workgroup_id_x(i32 %arg0) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.x()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workgroup_id_y:
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN: ; use s13
+define hidden void @other_arg_use_workgroup_id_y(i32 %arg0) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.y()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}other_arg_use_workgroup_id_z:
+; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; GCN: ; use s14
+define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
+ %val = call i32 @llvm.amdgcn.workgroup.id.z()
+ store volatile i32 %arg0, i32 addrspace(1)* undef
+ call void asm sideeffect "; use $0", "s"(i32 %val)
+ ret void
+}
+
+; GCN-LABEL: {{^}}use_every_sgpr_input:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s4
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s5
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s6
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s7
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], s8
+; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], s9
+; GCN: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+; GCN: ; use s[10:11]
+; GCN: ; use s12
+; GCN: ; use s13
+; GCN: ; use s14
+define hidden void @use_every_sgpr_input() #1 {
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+
+ %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+ %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+ %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc
+
+ %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+ %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+ %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
+
+ %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+ %implicitarg.ptr.bc = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
+ %val2 = load volatile i32, i32 addrspace(4)* %implicitarg.ptr.bc
+
+ %val3 = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %val3)
+
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val4)
+
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val5)
+
+ %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val6)
+
+ ret void
+}
+
+; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input:
+; GCN: s_mov_b32 s33, s17
+; GCN: s_mov_b32 s12, s14
+; GCN: s_mov_b32 s13, s15
+; GCN: s_mov_b32 s14, s16
+; GCN: s_mov_b32 s32, s33
+; GCN: s_swappc_b64
+
+; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
+; GCN: .amdhsa_user_sgpr_dispatch_ptr 1
+; GCN: .amdhsa_user_sgpr_queue_ptr 1
+; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1
+; GCN: .amdhsa_user_sgpr_dispatch_id 1
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+; GCN: .amdhsa_user_sgpr_private_segment_size 0
+; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
+; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
+; GCN: .amdhsa_system_sgpr_workgroup_id_y 1
+; GCN: .amdhsa_system_sgpr_workgroup_id_z 1
+; GCN: .amdhsa_system_sgpr_workgroup_info 0
+; GCN: .amdhsa_system_vgpr_workitem_id 2
+define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 {
+ call void @use_every_sgpr_input()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_indirect_use_every_sgpr_input:
+; GCN-NOT: s6
+; GCN-NOT: s7
+; GCN-NOT: s8
+; GCN-NOT: s9
+; GCN-NOT: s10
+; GCN-NOT: s11
+; GCN-NOT: s12
+; GCN-NOT: s13
+; GCN-NOT: s[6:7]
+; GCN-NOT: s[8:9]
+; GCN-NOT: s[10:11]
+; GCN-NOT: s[12:13]
+; GCN-NOT: s14
+; GCN: s_or_saveexec_b64 s[16:17], -1
+define hidden void @func_indirect_use_every_sgpr_input() #1 {
+ call void @use_every_sgpr_input()
+ ret void
+}
+
+; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz:
+; GCN-NOT: s12
+; GCN-NOT: s13
+; GCN-NOT: s14
+; GCN: ; use s[10:11]
+; GCN: ; use s12
+; GCN: ; use s13
+; GCN: ; use s14
+
+; GCN: s_swappc_b64
+define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
+ %alloca = alloca i32, align 4, addrspace(5)
+ store volatile i32 0, i32 addrspace(5)* %alloca
+
+ %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+ %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
+ %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc
+
+ %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+ %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
+ %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
+
+ %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+ %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
+ %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
+
+ %val3 = call i64 @llvm.amdgcn.dispatch.id()
+ call void asm sideeffect "; use $0", "s"(i64 %val3)
+
+ %val4 = call i32 @llvm.amdgcn.workgroup.id.x()
+ call void asm sideeffect "; use $0", "s"(i32 %val4)
+
+ %val5 = call i32 @llvm.amdgcn.workgroup.id.y()
+ call void asm sideeffect "; use $0", "s"(i32 %val5)
+
+ %val6 = call i32 @llvm.amdgcn.workgroup.id.z()
+ call void asm sideeffect "; use $0", "s"(i32 %val6)
+
+ call void @use_workgroup_id_xyz()
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+declare i32 @llvm.amdgcn.workgroup.id.y() #0
+declare i32 @llvm.amdgcn.workgroup.id.z() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
+declare i64 @llvm.amdgcn.dispatch.id() #0
+declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
+
+attributes #0 = { nounwind readnone speculatable }
+attributes #1 = { nounwind noinline }
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index e47390b48751..421d41294a28 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -1,8 +1,10 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VARABI %s
+; RUN: llc -amdgpu-fixed-function-abi -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIXEDABI %s
; GCN-LABEL: {{^}}use_workitem_id_x:
; GCN: s_waitcnt
-; GCN: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
+; VARABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v0
+; FIXEDABI: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@@ -14,7 +16,8 @@ define void @use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}use_workitem_id_y:
; GCN: s_waitcnt
-; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 10, 10
+; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@@ -26,7 +29,8 @@ define void @use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}use_workitem_id_z:
; GCN: s_waitcnt
-; GCN: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; VARABI: v_bfe_u32 [[ID:v[0-9]+]], v0, 20, 10
+; FIXEDABI: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[ID]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
@@ -38,8 +42,12 @@ define void @use_workitem_id_z() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xy:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+
+; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-NEXT: s_waitcnt
@@ -54,9 +62,16 @@ define void @use_workitem_id_xy() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xyz:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+
+; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+
+; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+
+
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
@@ -74,8 +89,12 @@ define void @use_workitem_id_xyz() #1 {
; GCN-LABEL: {{^}}use_workitem_id_xz:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; VARABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v0
+; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+
+; FIXEDABI-DAG: v_and_b32_e32 [[IDX:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDX]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
@@ -90,8 +109,12 @@ define void @use_workitem_id_xz() #1 {
; GCN-LABEL: {{^}}use_workitem_id_yz:
; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
-; GCN-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+; VARABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v0, 10, 10
+; VARABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v0, 20, 10
+
+; FIXEDABI-DAG: v_bfe_u32 [[IDY:v[0-9]+]], v31, 10, 10
+; FIXEDABI-DAG: v_bfe_u32 [[IDZ:v[0-9]+]], v31, 20, 10
+
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDY]]
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]:[0-9]+\]}}, [[IDZ]]
; GCN-NEXT: s_waitcnt
@@ -105,24 +128,39 @@ define void @use_workitem_id_yz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_x:
-; GCN: enable_vgpr_workitem_id = 0
+; VARABI: enable_vgpr_workitem_id = 0
+; FIXEDABI: enable_vgpr_workitem_id = 2
-; GCN-NOT: v0
+; FIXEDABI-NOT: v0
+; VARABI-NOT: v31
; GCN: s_swappc_b64
-; GCN-NOT: v0
+; FIXEDABI-NOT: v0
+; VARABI-NOT: v31
define amdgpu_kernel void @kern_indirect_use_workitem_id_x() #1 {
call void @use_workitem_id_x()
ret void
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_y:
-; GCN: enable_vgpr_workitem_id = 1
+; VARABI: enable_vgpr_workitem_id = 1
+; FIXEDABI: enable_vgpr_workitem_id = 2
+
+; FIXEDABI-NOT: v0
+; FIXEDABI-NOT: v1
+
+; VARABI-NOT: v31
+; VARABI: v_lshlrev_b32_e32 v0, 10, v1
+
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
+; FIXEDABI-NOT: v0
+; FIXEDABI-NOT: v1
+; VARABI-NOT: v31
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN: v_lshlrev_b32_e32 v0, 10, v1
-; GCN-NOT: v0
-; GCN-NOT: v1
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
call void @use_workitem_id_y()
@@ -132,11 +170,17 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
-; GCN-NOT: v0
-; GCN-NOT: v2
-; GCN: v_lshlrev_b32_e32 v0, 20, v2
-; GCN-NOT: v0
-; GCN-NOT: v1
+; VARABI-NOT: v0
+; VARABI-NOT: v2
+; VARABI: v_lshlrev_b32_e32 v0, 20, v2
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
call void @use_workitem_id_z()
@@ -144,12 +188,18 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_z() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xy:
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
-; GCN: v_or_b32_e32 v0, v0, [[IDY]]
-; GCN-NOT: v0
-; GCN-NOT: v1
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+; VARABI: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; VARABI: v_or_b32_e32 v0, v0, [[IDY]]
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
call void @use_workitem_id_xy()
@@ -157,12 +207,19 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xy() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xz:
-; GCN-NOT: v0
-; GCN-NOT: v2
-; GCN: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 v0, v0, [[IDZ]]
-; GCN-NOT: v0
-; GCN-NOT: v2
+; VARABI-NOT: v0
+; VARABI-NOT: v2
+; VARABI: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; VARABI: v_or_b32_e32 v0, v0, [[IDZ]]
+; VARABI-NOT: v0
+; VARABI-NOT: v2
+
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
call void @use_workitem_id_xz()
@@ -170,13 +227,20 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_xz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_yz:
-; GCN-NOT: v1
-; GCN-NOT: v2
-; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
-; GCN: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
-; GCN-NOT: v1
-; GCN-NOT: v2
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; VARABI: v_or_b32_e32 v0, [[IDY]], [[IDZ]]
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
call void @use_workitem_id_yz()
@@ -184,16 +248,22 @@ define amdgpu_kernel void @kern_indirect_use_workitem_id_yz() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workitem_id_xyz:
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN-NOT: v2
-; GCN-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
-; GCN-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
-; GCN-DAG: v_or_b32_e32 v0, v0, [[IDY]]
-; GCN-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN-NOT: v2
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDY:v[0-9]+]], 10, v1
+; VARABI-DAG: v_lshlrev_b32_e32 [[IDZ:v[0-9]+]], 20, v2
+; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDY]]
+; VARABI-DAG: v_or_b32_e32 v0, v0, [[IDZ]]
+; VARABI-NOT: v0
+; VARABI-NOT: v1
+; VARABI-NOT: v2
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_use_workitem_id_xyz() #1 {
call void @use_workitem_id_xyz()
@@ -229,7 +299,9 @@ define void @func_indirect_use_workitem_id_z() #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_x:
; GCN: s_waitcnt
-; GCN-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
+; VARABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v1
+; FIXEDABI-DAG: v_and_b32_e32 [[ID:v[0-9]+]], 0x3ff, v31
+
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
@@ -241,7 +313,8 @@ define void @other_arg_use_workitem_id_x(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_y:
; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
+; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 10, 10
+; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 10, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
@@ -253,7 +326,8 @@ define void @other_arg_use_workitem_id_y(i32 %arg0) #1 {
; GCN-LABEL: {{^}}other_arg_use_workitem_id_z:
; GCN: s_waitcnt
-; GCN-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
+; VARABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v1, 20, 10
+; FIXEDABI-DAG: v_bfe_u32 [[ID:v[0-9]+]], v31, 20, 10
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GCN-DAG: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[ID]]
define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
@@ -265,10 +339,17 @@ define void @other_arg_use_workitem_id_z(i32 %arg0) #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_x:
-; GCN: enable_vgpr_workitem_id = 0
+; VARABI: enable_vgpr_workitem_id = 0
+; FIXEDABI: enable_vgpr_workitem_id = 2
+
+; VARABI: v_mov_b32_e32 v1, v0
+; VARABI: v_mov_b32_e32 v0, 0x22b
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
-; GCN: v_mov_b32_e32 v1, v0
-; GCN: v_mov_b32_e32 v0, 0x22b
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
call void @other_arg_use_workitem_id_x(i32 555)
@@ -277,14 +358,20 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_y:
-; GCN: enable_vgpr_workitem_id = 1
-
-; GCN: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NOT: v1
-; GCN: v_mov_b32_e32 v0, 0x22b
-; GCN-NOT: v1
-; GCN: s_swappc_b64
-; GCN-NOT: v0
+; VARABI: enable_vgpr_workitem_id = 1
+
+; VARABI: v_lshlrev_b32_e32 v1, 10, v1
+; VARABI-NOT: v1
+; VARABI: v_mov_b32_e32 v0, 0x22b
+; VARABI-NOT: v1
+; VARABI: s_swappc_b64
+; VARABI-NOT: v0
+
+; FIXEDABI: enable_vgpr_workitem_id = 2
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
call void @other_arg_use_workitem_id_y(i32 555)
ret void
@@ -293,24 +380,33 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workitem_id_z:
; GCN: enable_vgpr_workitem_id = 2
-; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: v_lshlrev_b32_e32 v1, 20, v2
-; GCN: s_swappc_b64
-; GCN-NOT: v0
+; VARABI-DAG: v_mov_b32_e32 v0, 0x22b
+; VARABI-DAG: v_lshlrev_b32_e32 v1, 20, v2
+; VARABI: s_swappc_b64
+; VARABI-NOT: v0
+
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
call void @other_arg_use_workitem_id_z(i32 555)
ret void
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GCN: v_and_b32_e32 v32, 0x3ff, v32
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
+; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; VARABI: v_and_b32_e32 v32, 0x3ff, v32
+; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
+; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VARABI-NEXT: s_waitcnt
+; VARABI-NEXT: s_setpc_b64
+
+; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
+; FIXEDABI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
define void @too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@@ -359,12 +455,25 @@ define void @too_many_args_use_workitem_id_x(
}
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x:
-; GCN: enable_vgpr_workitem_id = 0
+; VARABI: enable_vgpr_workitem_id = 0
-; GCN: s_mov_b32 s33, s7
-; GCN: s_mov_b32 s32, s33
-; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
-; GCN: s_swappc_b64
+; VARABI: s_mov_b32 s33, s7
+; VARABI: s_mov_b32 s32, s33
+; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; VARABI: s_swappc_b64
+
+
+; FIXEDABI: enable_vgpr_workitem_id = 2
+; FIXEDABI: s_mov_b32 s33, s17
+; FIXEDABI-DAG: s_mov_b32 s32, s33
+; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+
+; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
call void @too_many_args_use_workitem_id_x(
i32 10, i32 20, i32 30, i32 40,
@@ -379,8 +488,16 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
-; GCN: s_mov_b32 s34, s32
-; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}}
+; VARABI: s_mov_b32 s34, s32
+; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}}
+
+; Touching the workitem id register is not necessary.
+; FIXEDABI-NOT: v31
+; FIXEDABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}}
+; FIXEDABI-NOT: v31
+; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+; FIXEDABI-NOT: v31
+
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
@@ -422,20 +539,29 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
ret void
}
-; stack layout:
+; var abi stack layout:
; frame[0] = byval arg32
; frame[1] = stack passed workitem ID x
; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: v_and_b32_e32 v32, 0x3ff, v32
-; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
-; GCN: buffer_load_dword v0, off, s[0:3], s32{{$}}
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN: s_setpc_b64
+; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VARABI-NEXT: s_waitcnt
+; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
+; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
+; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
+; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VARABI: s_setpc_b64
+
+
+; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
+; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v31
+
+; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
+; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
+; FIXEDABI: buffer_load_dword v0, off, s[0:3], s32 offset:4{{$}}
+; FIXEDABI: s_setpc_b64
define void @too_many_args_use_workitem_id_x_byval(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@@ -483,24 +609,46 @@ define void @too_many_args_use_workitem_id_x_byval(
ret void
}
+; var abi stack layout:
; sp[0] = byval
; sp[1] = ??
; sp[2] = stack passed workitem ID x
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval:
-; GCN: enable_vgpr_workitem_id = 0
-; GCN-DAG: s_mov_b32 s33, s7
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
-; GCN: s_add_u32 s32, s33, 0x400{{$}}
-
-; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
-
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
-; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
-; GCN: s_swappc_b64
+; VARABI: enable_vgpr_workitem_id = 0
+; VARABI-DAG: s_mov_b32 s33, s7
+; VARABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; VARABI: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
+; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
+; VARABI: s_add_u32 s32, s33, 0x400{{$}}
+
+; VARABI-NOT: s32
+; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
+
+; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
+; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; VARABI: s_swappc_b64
+
+
+; FIXEDABI: s_mov_b32 s33, s17
+; FIXEDABI-DAG: s_add_u32 s32, s33, 0x400
+; FIXEDABI-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33 offset:4{{$}}
+
+; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140
+; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
+
+; FIXME: Why this reload?
+; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], s33 offset:4{{$}}
+
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; FIXEDABI-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; FIXEDABI: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+
+; FIXEDABI-NOT: s32
+; FIXEDABI: buffer_store_dword [[RELOAD]], off, s[0:3], s32 offset:4
+; FIXEDABI: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
@@ -518,13 +666,27 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
}
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
-; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
-; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
-; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
-; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
-; GCN: s_swappc_b64
+; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
+; VARABI: buffer_store_dword [[K]], off, s[0:3], s34{{$}}
+; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
+; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}}
+; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]],
+; VARABI: s_swappc_b64
+
+
+; FIXEDABI-NOT: v31
+; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}}
+; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s34{{$}}
+
+; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}}
+; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}}
+; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}}
+
+; FIXEDABI-NOT: v31
+; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
+; FIXEDABI-NOT: v31
+; FIXEDABI: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 999, i32 addrspace(5)* %alloca
@@ -543,25 +705,36 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; Only one stack load should be emitted for all 3 values.
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
-; GCN-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
-; GCN: buffer_load_dword v32, off, s[0:3], s32{{$}}
-; GCN-NOT: buffer_load_dword
-
-; GCN: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32
-; GCN-NOT: buffer_load_dword
-; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
-; GCN-NOT: buffer_load_dword
-; GCN: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
-; GCN-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
-; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
-; GCN-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
-
-; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
-; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
+; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; VARABI: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
+; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
+; VARABI-NOT: buffer_load_dword
+
+; VARABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v32
+; VARABI-NOT: buffer_load_dword
+; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
+; VARABI-NOT: buffer_load_dword
+; VARABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v32, 10, 10
+; VARABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v32, 20, 10
+; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
+; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
+
+; VARABI: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; VARABI-NEXT: s_waitcnt
+; VARABI-NEXT: s_setpc_b64
+
+
+; FIXEDABI: v_and_b32_e32 [[AND_X:v[0-9]+]], 0x3ff, v31
+; FIXEDABI-NOT: buffer_load_dword
+; FIXEDABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[AND_X]]
+; FIXEDABI-NOT: buffer_load_dword
+; FIXEDABI: v_bfe_u32 [[BFE_Y:v[0-9]+]], v31, 10, 10
+; FIXEDABI-NEXT: v_bfe_u32 [[BFE_Z:v[0-9]+]], v31, 20, 10
+; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
+; FIXEDABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
+
define void @too_many_args_use_workitem_id_xyz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
@@ -613,19 +786,23 @@ define void @too_many_args_use_workitem_id_xyz(
ret void
}
-; frame[0] = ID { Z, Y, X }
-
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2
-; GCN-DAG: s_mov_b32 s33, s7
+; VARABI-DAG: s_mov_b32 s33, s7
+; FIXEDABI-DAG: s_mov_b32 s33, s17
; GCN-DAG: s_mov_b32 s32, s33
-; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
-; GCN-DAG: v_or_b32_e32 v0, v0, v1
-; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2
-; GCN-DAG: v_or_b32_e32 v0, v0, v2
-; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
+; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1
+; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2
+; GCN-DAG: v_or_b32_e32 [[TMP2:v[0-9]+]], v0, [[TMP1]]
+; VARABI-DAG: v_or_b32_e32 [[PACKEDID:v[0-9]+]], [[TMP2]], [[TMP0]]
+; VARABI: buffer_store_dword [[PACKEDID]], off, s[0:3], s32{{$}}
+
+; FIXEDABI-DAG: v_or_b32_e32 v31, [[TMP2]], [[TMP0]]
+; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140
+; FIXEDABI: buffer_store_dword [[K]], off, s[0:3], s32{{$}}
+
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
call void @too_many_args_use_workitem_id_xyz(
@@ -640,7 +817,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
ret void
}
-; workitem ID X in register, yz on stack
+; Var abi: workitem ID X in register, yz on stack
; v31 = workitem ID X
; frame[0] = workitem { Z, Y, X }
@@ -708,7 +885,8 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2
-; GCN: s_mov_b32 s33, s7
+; VARABI: s_mov_b32 s33, s7
+; FIXEDABI: s_mov_b32 s33, s17
; GCN-NOT: v0
; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1
More information about the llvm-commits
mailing list