<div dir="ltr">Hi,<div><br></div><div>MSan is not happy about this change, but the report is unfortunately truncated:</div><div><a href="http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-bootstrap-msan/builds/5926/steps/check-llvm%20msan/logs/stdio">http://lab.llvm.org:8011/builders/sanitizer-x86_64-linux-bootstrap-msan/builds/5926/steps/check-llvm%20msan/logs/stdio</a><br></div></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Jul 13, 2018 at 9:40 AM, Matt Arsenault via llvm-commits <span dir="ltr"><<a href="mailto:llvm-commits@lists.llvm.org" target="_blank">llvm-commits@lists.llvm.org</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Author: arsenm<br>
Date: Fri Jul 13 09:40:25 2018<br>
New Revision: 337021<br>
<br>
URL: <a href="http://llvm.org/viewvc/llvm-project?rev=337021&view=rev" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project?rev=337021&view=rev</a><br>
Log:<br>
AMDGPU: Fix handling of alignment padding in DAG argument lowering<br>
<br>
This was completely broken if there was ever a struct argument, as<br>
this information is thrown away during the argument analysis.<br>
<br>
The offsets as passed in to LowerFormalArguments are not useful,<br>
as they partially depend on the legalized result register type,<br>
and they don't consider the alignment in the first place.<br>
<br>
Ignore the Ins array, and instead figure out from the raw IR type<br>
what we need to do. This seems to fix the padding computation<br>
if the DAG lowering is forced (and stops breaking arguments<br>
following padded arguments if the arguments were only partially<br>
lowered in the IR)<br>
<br>
Added:<br>
llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-argument-dag-<wbr>lowering.ll<br>
Modified:<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUAsmPrinter.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUHSAMetadataStreamer.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.h<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPULowerKernelArguments.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.h<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.h<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>R600.td<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>R600ISelLowering.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>SIISelLowering.cpp<br>
llvm/trunk/lib/Target/AMDGPU/<wbr>SIMachineFunctionInfo.cpp<br>
llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-args.ll<br>
llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.<wbr>implicitarg.ptr.ll<br>
llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.kernarg.<wbr>segment.ptr.ll<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUAsmPrinter.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUAsmPrinter.cpp?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUAsmPrinter.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUAsmPrinter.cpp Fri Jul 13 09:40:25 2018<br>
@@ -1128,6 +1128,13 @@ static amd_element_byte_size_t getElemen<br>
void AMDGPUAsmPrinter::<wbr>getAmdKernelCode(amd_kernel_<wbr>code_t &Out,<br>
const SIProgramInfo &CurrentProgramInfo,<br>
const MachineFunction &MF) const {<br>
+ const Function &F = MF.getFunction();<br>
+<br>
+ // Avoid asserting on erroneous cases.<br>
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&<br>
+ F.getCallingConv() != CallingConv::SPIR_KERNEL)<br>
+ return;<br>
+<br>
const SIMachineFunctionInfo *MFI = MF.getInfo<<wbr>SIMachineFunctionInfo>();<br>
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>(<wbr>);<br>
<br>
@@ -1174,9 +1181,8 @@ void AMDGPUAsmPrinter::<wbr>getAmdKernelCode(<br>
if (STM.isXNACKEnabled())<br>
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_<wbr>SUPPORTED;<br>
<br>
- // FIXME: Should use getKernArgSize<br>
- Out.kernarg_segment_byte_size =<br>
- STM.getKernArgSegmentSize(MF.<wbr>getFunction(), MFI->getExplicitKernArgSize())<wbr>;<br>
+ unsigned MaxKernArgAlign;<br>
+ Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);<br>
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;<br>
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;<br>
Out.workitem_private_segment_<wbr>byte_size = CurrentProgramInfo.<wbr>ScratchSize;<br>
@@ -1185,7 +1191,7 @@ void AMDGPUAsmPrinter::<wbr>getAmdKernelCode(<br>
// These alignment values are specified in powers of two, so alignment =<br>
// 2^n. The minimum alignment is 2^4 = 16.<br>
Out.kernarg_segment_alignment = std::max((size_t)4,<br>
- countTrailingZeros(MFI-><wbr>getMaxKernArgAlign()));<br>
+ countTrailingZeros(<wbr>MaxKernArgAlign));<br>
<br>
if (STM.debuggerEmitPrologue()) {<br>
Out.debug_wavefront_private_<wbr>segment_offset_sgpr =<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUHSAMetadataStreamer.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/<wbr>AMDGPUHSAMetadataStreamer.cpp?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUHSAMetadataStreamer.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUHSAMetadataStreamer.cpp Fri Jul 13 09:40:25 2018<br>
@@ -209,15 +209,16 @@ Kernel::CodeProps::Metadata MetadataStre<br>
const Function &F = MF.getFunction();<br>
<br>
// Avoid asserting on erroneous cases.<br>
- if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)<br>
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&<br>
+ F.getCallingConv() != CallingConv::SPIR_KERNEL)<br>
return HSACodeProps;<br>
<br>
- HSACodeProps.<wbr>mKernargSegmentSize =<br>
- STM.getKernArgSegmentSize(F, MFI.getExplicitKernArgSize());<br>
+ unsigned MaxKernArgAlign;<br>
+ HSACodeProps.<wbr>mKernargSegmentSize = STM.getKernArgSegmentSize(F,<br>
+ MaxKernArgAlign);<br>
HSACodeProps.<wbr>mGroupSegmentFixedSize = ProgramInfo.LDSSize;<br>
HSACodeProps.<wbr>mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;<br>
- HSACodeProps.<wbr>mKernargSegmentAlign =<br>
- std::max(uint32_t(4), MFI.getMaxKernArgAlign());<br>
+ HSACodeProps.<wbr>mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);<br>
HSACodeProps.mWavefrontSize = STM.getWavefrontSize();<br>
HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;<br>
HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUISelLowering.cpp?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.cpp Fri Jul 13 09:40:25 2018<br>
@@ -30,6 +30,7 @@<br>
#include "SIInstrInfo.h"<br>
#include "SIMachineFunctionInfo.h"<br>
#include "MCTargetDesc/<wbr>AMDGPUMCTargetDesc.h"<br>
+#include "llvm/CodeGen/Analysis.h"<br>
#include "llvm/CodeGen/<wbr>CallingConvLower.h"<br>
#include "llvm/CodeGen/MachineFunction.<wbr>h"<br>
#include "llvm/CodeGen/<wbr>MachineRegisterInfo.h"<br>
@@ -40,18 +41,6 @@<br>
#include "llvm/Support/KnownBits.h"<br>
using namespace llvm;<br>
<br>
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,<br>
- CCValAssign::LocInfo LocInfo,<br>
- ISD::ArgFlagsTy ArgFlags, CCState &State) {<br>
- MachineFunction &MF = State.getMachineFunction();<br>
- AMDGPUMachineFunction *MFI = MF.getInfo<<wbr>AMDGPUMachineFunction>();<br>
-<br>
- uint64_t Offset = MFI->allocateKernArg(LocVT.<wbr>getStoreSize(),<br>
- ArgFlags.getOrigAlign());<br>
- State.addLoc(CCValAssign::<wbr>getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));<br>
- return true;<br>
-}<br>
-<br>
static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,<br>
CCValAssign::LocInfo LocInfo,<br>
ISD::ArgFlagsTy ArgFlags, CCState &State,<br>
@@ -910,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssign<br>
/// for each individual part is i8. We pass the memory type as LocVT to the<br>
/// calling convention analysis function and the register type (Ins[x].VT) as<br>
/// the ValVT.<br>
-void AMDGPUTargetLowering::<wbr>analyzeFormalArgumentsCompute(<wbr>CCState &State,<br>
- const SmallVectorImpl<ISD::InputArg> &Ins) const {<br>
- for (unsigned i = 0, e = Ins.size(); i != e; ++i) {<br>
- const ISD::InputArg &In = Ins[i];<br>
- EVT MemVT;<br>
-<br>
- unsigned NumRegs = getNumRegisters(State.<wbr>getContext(), In.ArgVT);<br>
-<br>
- if (!Subtarget->isAmdHsaOS() &&<br>
- (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {<br>
- // The ABI says the caller will extend these values to 32-bits.<br>
- MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;<br>
- } else if (NumRegs == 1) {<br>
- // This argument is not split, so the IR type is the memory type.<br>
- assert(!In.Flags.isSplit());<br>
- if (In.ArgVT.isExtended()) {<br>
- // We have an extended type, like i24, so we should just use the register type<br>
- MemVT = In.VT;<br>
+void AMDGPUTargetLowering::<wbr>analyzeFormalArgumentsCompute(<br>
+ CCState &State,<br>
+ const SmallVectorImpl<ISD::InputArg> &Ins) const {<br>
+ const MachineFunction &MF = State.getMachineFunction();<br>
+ const Function &Fn = MF.getFunction();<br>
+ LLVMContext &Ctx = Fn.getParent()->getContext();<br>
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);<br>
+ const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(<wbr>Fn);<br>
+<br>
+ unsigned MaxAlign = 1;<br>
+ uint64_t ExplicitArgOffset = 0;<br>
+ const DataLayout &DL = Fn.getParent()->getDataLayout(<wbr>);<br>
+<br>
+ unsigned InIndex = 0;<br>
+<br>
+ for (const Argument &Arg : Fn.args()) {<br>
+ Type *BaseArgTy = Arg.getType();<br>
+ unsigned Align = DL.getABITypeAlignment(<wbr>BaseArgTy);<br>
+ MaxAlign = std::max(Align, MaxAlign);<br>
+ unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy)<wbr>;<br>
+<br>
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;<br>
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;<br>
+<br>
+ // We're basically throwing away everything passed into us and starting over<br>
+ // to get accurate in-memory offsets. The "PartOffset" is completely useless<br>
+ // to us as computed in Ins.<br>
+ //<br>
+ // We also need to figure out what type legalization is trying to do to get<br>
+ // the correct memory offsets.<br>
+<br>
+ SmallVector<EVT, 16> ValueVTs;<br>
+ SmallVector<uint64_t, 16> Offsets;<br>
+ ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);<br>
+<br>
+ for (unsigned Value = 0, NumValues = ValueVTs.size();<br>
+ Value != NumValues; ++Value) {<br>
+ uint64_t BasePartOffset = Offsets[Value];<br>
+<br>
+ EVT ArgVT = ValueVTs[Value];<br>
+ EVT MemVT = ArgVT;<br>
+ MVT RegisterVT =<br>
+ getRegisterTypeForCallingConv(<wbr>Ctx, ArgVT);<br>
+ unsigned NumRegs =<br>
+ getNumRegistersForCallingConv(<wbr>Ctx, ArgVT);<br>
+<br>
+ if (!Subtarget->isAmdHsaOS() &&<br>
+ (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {<br>
+ // The ABI says the caller will extend these values to 32-bits.<br>
+ MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;<br>
+ } else if (NumRegs == 1) {<br>
+ // This argument is not split, so the IR type is the memory type.<br>
+ if (ArgVT.isExtended()) {<br>
+ // We have an extended type, like i24, so we should just use the<br>
+ // register type.<br>
+ MemVT = RegisterVT;<br>
+ } else {<br>
+ MemVT = ArgVT;<br>
+ }<br>
+ } else if (ArgVT.isVector() && RegisterVT.isVector() &&<br>
+ ArgVT.getScalarType() == RegisterVT.getScalarType()) {<br>
+ assert(ArgVT.<wbr>getVectorNumElements() > RegisterVT.<wbr>getVectorNumElements());<br>
+ // We have a vector value which has been split into a vector with<br>
+ // the same scalar type, but fewer elements. This should handle<br>
+ // all the floating-point vector types.<br>
+ MemVT = RegisterVT;<br>
+ } else if (ArgVT.isVector() &&<br>
+ ArgVT.getVectorNumElements() == NumRegs) {<br>
+ // This arg has been split so that each element is stored in a separate<br>
+ // register.<br>
+ MemVT = ArgVT.getScalarType();<br>
+ } else if (ArgVT.isExtended()) {<br>
+ // We have an extended type, like i65.<br>
+ MemVT = RegisterVT;<br>
} else {<br>
- MemVT = In.ArgVT;<br>
+ unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;<br>
+ assert(ArgVT.<wbr>getStoreSizeInBits() % NumRegs == 0);<br>
+ if (RegisterVT.isInteger()) {<br>
+ MemVT = EVT::getIntegerVT(State.<wbr>getContext(), MemoryBits);<br>
+ } else if (RegisterVT.isVector()) {<br>
+ assert(!RegisterVT.<wbr>getScalarType().<wbr>isFloatingPoint());<br>
+ unsigned NumElements = RegisterVT.<wbr>getVectorNumElements();<br>
+ assert(MemoryBits % NumElements == 0);<br>
+ // This vector type has been split into another vector type with<br>
+ // a different elements size.<br>
+ EVT ScalarVT = EVT::getIntegerVT(State.<wbr>getContext(),<br>
+ MemoryBits / NumElements);<br>
+ MemVT = EVT::getVectorVT(State.<wbr>getContext(), ScalarVT, NumElements);<br>
+ } else {<br>
+ llvm_unreachable("cannot deduce memory type.");<br>
+ }<br>
}<br>
- } else if (In.ArgVT.isVector() && In.VT.isVector() &&<br>
- In.ArgVT.getScalarType() == In.VT.getScalarType()) {<br>
- assert(In.ArgVT.<wbr>getVectorNumElements() > In.VT.getVectorNumElements());<br>
- // We have a vector value which has been split into a vector with<br>
- // the same scalar type, but fewer elements. This should handle<br>
- // all the floating-point vector types.<br>
- MemVT = In.VT;<br>
- } else if (In.ArgVT.isVector() &&<br>
- In.ArgVT.getVectorNumElements(<wbr>) == NumRegs) {<br>
- // This arg has been split so that each element is stored in a separate<br>
- // register.<br>
- MemVT = In.ArgVT.getScalarType();<br>
- } else if (In.ArgVT.isExtended()) {<br>
- // We have an extended type, like i65.<br>
- MemVT = In.VT;<br>
- } else {<br>
- unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;<br>
- assert(In.ArgVT.<wbr>getStoreSizeInBits() % NumRegs == 0);<br>
- if (In.VT.isInteger()) {<br>
- MemVT = EVT::getIntegerVT(State.<wbr>getContext(), MemoryBits);<br>
- } else if (In.VT.isVector()) {<br>
- assert(!In.VT.getScalarType().<wbr>isFloatingPoint());<br>
- unsigned NumElements = In.VT.getVectorNumElements();<br>
- assert(MemoryBits % NumElements == 0);<br>
- // This vector type has been split into another vector type with<br>
- // a different elements size.<br>
- EVT ScalarVT = EVT::getIntegerVT(State.<wbr>getContext(),<br>
- MemoryBits / NumElements);<br>
- MemVT = EVT::getVectorVT(State.<wbr>getContext(), ScalarVT, NumElements);<br>
- } else {<br>
- llvm_unreachable("cannot deduce memory type.");<br>
+<br>
+ // Convert one element vectors to scalar.<br>
+ if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)<br>
+ MemVT = MemVT.getScalarType();<br>
+<br>
+ if (MemVT.isExtended()) {<br>
+ // This should really only happen if we have vec3 arguments<br>
+ assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);<br>
+ MemVT = MemVT.getPow2VectorType(State.<wbr>getContext());<br>
}<br>
- }<br>
<br>
- // Convert one element vectors to scalar.<br>
- if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)<br>
- MemVT = MemVT.getScalarType();<br>
-<br>
- if (MemVT.isExtended()) {<br>
- // This should really only happen if we have vec3 arguments<br>
- assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);<br>
- MemVT = MemVT.getPow2VectorType(State.<wbr>getContext());<br>
+ unsigned PartOffset = 0;<br>
+ for (unsigned i = 0; i != NumRegs; ++i) {<br>
+ State.addLoc(CCValAssign::<wbr>getCustomMem(InIndex++, RegisterVT,<br>
+ BasePartOffset + PartOffset,<br>
+ MemVT.getSimpleVT(),<br>
+ CCValAssign::Full));<br>
+ PartOffset += MemVT.getStoreSize();<br>
+ }<br>
}<br>
-<br>
- assert(MemVT.isSimple());<br>
- allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,<br>
- State);<br>
}<br>
}<br>
<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.h<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUISelLowering.h?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.h (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUISelLowering.h Fri Jul 13 09:40:25 2018<br>
@@ -122,8 +122,11 @@ protected:<br>
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;<br>
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,<br>
SmallVectorImpl<SDValue> &Results) const;<br>
- void analyzeFormalArgumentsCompute(<wbr>CCState &State,<br>
- const SmallVectorImpl<ISD::InputArg> &Ins) const;<br>
+<br>
+ void analyzeFormalArgumentsCompute(<br>
+ CCState &State,<br>
+ const SmallVectorImpl<ISD::InputArg> &Ins) const;<br>
+<br>
public:<br>
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);<br>
<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPULowerKernelArguments.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/<wbr>AMDGPULowerKernelArguments.<wbr>cpp?rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPULowerKernelArguments.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPULowerKernelArguments.cpp Fri Jul 13 09:40:25 2018<br>
@@ -77,8 +77,9 @@ bool AMDGPULowerKernelArguments::<wbr>runOnFu<br>
const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary<br>
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(<wbr>F);<br>
<br>
+ unsigned MaxAlign;<br>
// FIXME: Alignment is broken broken with explicit arg offset.;<br>
- const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F);<br>
+ const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);<br>
if (TotalKernArgSize == 0)<br>
return false;<br>
<br>
@@ -91,13 +92,11 @@ bool AMDGPULowerKernelArguments::<wbr>runOnFu<br>
Attribute::<wbr>getWithDereferenceableBytes(<wbr>Ctx, TotalKernArgSize));<br>
<br>
unsigned AS = KernArgSegment->getType()-><wbr>getPointerAddressSpace();<br>
- unsigned MaxAlign = 1;<br>
uint64_t ExplicitArgOffset = 0;<br>
<br>
for (Argument &Arg : F.args()) {<br>
Type *ArgTy = Arg.getType();<br>
unsigned Align = DL.getABITypeAlignment(ArgTy);<br>
- MaxAlign = std::max(Align, MaxAlign);<br>
unsigned Size = DL.getTypeSizeInBits(ArgTy);<br>
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);<br>
<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUMachineFunction.<wbr>cpp?rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.cpp Fri Jul 13 09:40:25 2018<br>
@@ -24,16 +24,23 @@ AMDGPUMachineFunction::<wbr>AMDGPUMachineFunc<br>
NoSignedZerosFPMath(MF.<wbr>getTarget().Options.<wbr>NoSignedZerosFPMath),<br>
MemoryBound(false),<br>
WaveLimiter(false) {<br>
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);<br>
+<br>
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,<br>
// except reserved size is not correctly aligned.<br>
+ const Function &F = MF.getFunction();<br>
<br>
if (auto *Resolver = MF.getMMI().getResolver()) {<br>
if (AMDGPUPerfHintAnalysis *PHA = static_cast<<wbr>AMDGPUPerfHintAnalysis*>(<br>
Resolver-><wbr>getAnalysisIfAvailable(&<wbr>AMDGPUPerfHintAnalysisID, true))) {<br>
- MemoryBound = PHA->isMemoryBound(&MF.<wbr>getFunction());<br>
- WaveLimiter = PHA->needsWaveLimiter(&MF.<wbr>getFunction());<br>
+ MemoryBound = PHA->isMemoryBound(&F);<br>
+ WaveLimiter = PHA->needsWaveLimiter(&F);<br>
}<br>
}<br>
+<br>
+ CallingConv::ID CC = F.getCallingConv();<br>
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)<br>
+ ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);<br>
}<br>
<br>
unsigned AMDGPUMachineFunction::<wbr>allocateLDSGlobal(const DataLayout &DL,<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.h<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUMachineFunction.<wbr>h?rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.h (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUMachineFunction.h Fri Jul 13 09:40:25 2018<br>
@@ -23,8 +23,8 @@ class AMDGPUMachineFunction : public Mac<br>
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;<br>
<br>
protected:<br>
- uint64_t ExplicitKernArgSize;<br>
- unsigned MaxKernArgAlign;<br>
+ uint64_t ExplicitKernArgSize; // Cache for this.<br>
+ unsigned MaxKernArgAlign; // Cache for this.<br>
<br>
/// Number of bytes in the LDS that are being used.<br>
unsigned LDSSize;<br>
@@ -44,17 +44,6 @@ protected:<br>
public:<br>
AMDGPUMachineFunction(const MachineFunction &MF);<br>
<br>
- uint64_t allocateKernArg(uint64_t Size, unsigned Align) {<br>
- assert(isPowerOf2_32(Align));<br>
- ExplicitKernArgSize = alignTo(ExplicitKernArgSize, Align);<br>
-<br>
- uint64_t Result = ExplicitKernArgSize;<br>
- ExplicitKernArgSize += Size;<br>
-<br>
- MaxKernArgAlign = std::max(Align, MaxKernArgAlign);<br>
- return Result;<br>
- }<br>
-<br>
uint64_t getExplicitKernArgSize() const {<br>
return ExplicitKernArgSize;<br>
}<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUSubtarget.cpp?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.cpp Fri Jul 13 09:40:25 2018<br>
@@ -209,7 +209,7 @@ GCNSubtarget::GCNSubtarget(<wbr>const Triple<br>
<br>
FeatureDisable(false),<br>
InstrInfo(<wbr>initializeSubtargetDependencie<wbr>s(TT, GPU, FS)),<br>
- TLInfo(TM, *this), <br>
+ TLInfo(TM, *this),<br>
FrameLowering(<wbr>TargetFrameLowering::<wbr>StackGrowsUp, getStackAlignment(), 0) {<br>
AS = AMDGPU::getAMDGPUAS(TT);<br>
CallLoweringInfo.reset(new AMDGPUCallLowering(*<wbr>getTargetLowering()));<br>
@@ -406,6 +406,44 @@ bool AMDGPUSubtarget::<wbr>makeLIDRangeMetada<br>
return true;<br>
}<br>
<br>
+uint64_t AMDGPUSubtarget::<wbr>getExplicitKernArgSize(const Function &F,<br>
+ unsigned &MaxAlign) const {<br>
+ assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||<br>
+ F.getCallingConv() == CallingConv::SPIR_KERNEL);<br>
+<br>
+ const DataLayout &DL = F.getParent()->getDataLayout()<wbr>;<br>
+ uint64_t ExplicitArgBytes = 0;<br>
+ MaxAlign = 1;<br>
+<br>
+ for (const Argument &Arg : F.args()) {<br>
+ Type *ArgTy = Arg.getType();<br>
+<br>
+ unsigned Align = DL.getABITypeAlignment(ArgTy);<br>
+ uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);<br>
+ ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;<br>
+ MaxAlign = std::max(MaxAlign, Align);<br>
+ }<br>
+<br>
+ return ExplicitArgBytes;<br>
+}<br>
+<br>
+unsigned AMDGPUSubtarget::<wbr>getKernArgSegmentSize(const Function &F,<br>
+ unsigned &MaxAlign) const {<br>
+ uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);<br>
+<br>
+ unsigned ExplicitOffset = getExplicitKernelArgOffset(F);<br>
+<br>
+ uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;<br>
+ unsigned ImplicitBytes = getImplicitArgNumBytes(F);<br>
+ if (ImplicitBytes != 0) {<br>
+ unsigned Alignment = getAlignmentForImplicitArgPtr(<wbr>);<br>
+ TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;<br>
+ }<br>
+<br>
+ // Being able to dereference past the end is useful for emitting scalar loads.<br>
+ return alignTo(TotalSize, 4);<br>
+}<br>
+<br>
R600Subtarget::R600Subtarget(<wbr>const Triple &TT, StringRef GPU, StringRef FS,<br>
const TargetMachine &TM) :<br>
R600GenSubtargetInfo(TT, GPU, FS),<br>
@@ -446,40 +484,6 @@ bool GCNSubtarget::<wbr>isVGPRSpillingEnabled<br>
return EnableVGPRSpilling || !AMDGPU::isShader(F.<wbr>getCallingConv());<br>
}<br>
<br>
-uint64_t GCNSubtarget::<wbr>getExplicitKernArgSize(const Function &F) const {<br>
- assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL);<br>
-<br>
- const DataLayout &DL = F.getParent()->getDataLayout()<wbr>;<br>
- uint64_t ExplicitArgBytes = 0;<br>
- for (const Argument &Arg : F.args()) {<br>
- Type *ArgTy = Arg.getType();<br>
-<br>
- unsigned Align = DL.getABITypeAlignment(ArgTy);<br>
- uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);<br>
- ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;<br>
- }<br>
-<br>
- return ExplicitArgBytes;<br>
-}<br>
-<br>
-unsigned GCNSubtarget::<wbr>getKernArgSegmentSize(const Function &F,<br>
- int64_t ExplicitArgBytes) const {<br>
- if (ExplicitArgBytes == -1)<br>
- ExplicitArgBytes = getExplicitKernArgSize(F);<br>
-<br>
- unsigned ExplicitOffset = getExplicitKernelArgOffset(F);<br>
-<br>
- uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;<br>
- unsigned ImplicitBytes = getImplicitArgNumBytes(F);<br>
- if (ImplicitBytes != 0) {<br>
- unsigned Alignment = getAlignmentForImplicitArgPtr(<wbr>);<br>
- TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;<br>
- }<br>
-<br>
- // Being able to dereference past the end is useful for emitting scalar loads.<br>
- return alignTo(TotalSize, 4);<br>
-}<br>
-<br>
unsigned GCNSubtarget::<wbr>getOccupancyWithNumSGPRs(<wbr>unsigned SGPRs) const {<br>
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_<wbr>ISLANDS) {<br>
if (SGPRs <= 80)<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.h<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/AMDGPUSubtarget.h?rev=<wbr>337021&r1=337020&r2=337021&<wbr>view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.h (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>AMDGPUSubtarget.h Fri Jul 13 09:40:25 2018<br>
@@ -51,7 +51,7 @@ public:<br>
enum Generation {<br>
R600 = 0,<br>
R700 = 1,<br>
- EVERGREEN = 2, <br>
+ EVERGREEN = 2,<br>
NORTHERN_ISLANDS = 3,<br>
SOUTHERN_ISLANDS = 4,<br>
SEA_ISLANDS = 5,<br>
@@ -82,7 +82,7 @@ public:<br>
<br>
static const AMDGPUSubtarget &get(const MachineFunction &MF);<br>
static const AMDGPUSubtarget &get(const TargetMachine &TM,<br>
- const Function &F);<br>
+ const Function &F);<br>
<br>
/// \returns Default range flat work group size for a calling convention.<br>
std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(<wbr>CallingConv::ID CC) const;<br>
@@ -231,6 +231,18 @@ public:<br>
/// Creates value range metadata on an workitemid.* inrinsic call or load.<br>
bool makeLIDRangeMetadata(<wbr>Instruction *I) const;<br>
<br>
+ /// \returns Number of bytes of arguments that are passed to a shader or<br>
+ /// kernel in addition to the explicit ones declared for the function.<br>
+ unsigned getImplicitArgNumBytes(const Function &F) const {<br>
+ if (isMesaKernel(F))<br>
+ return 16;<br>
+ return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes"<wbr>, 0);<br>
+ }<br>
+ uint64_t getExplicitKernArgSize(const Function &F,<br>
+ unsigned &MaxAlign) const;<br>
+ unsigned getKernArgSegmentSize(const Function &F,<br>
+ unsigned &MaxAlign) const;<br>
+<br>
virtual ~AMDGPUSubtarget() {}<br>
};<br>
<br>
@@ -669,14 +681,6 @@ public:<br>
return D16PreservesUnusedBits;<br>
}<br>
<br>
- /// \returns Number of bytes of arguments that are passed to a shader or<br>
- /// kernel in addition to the explicit ones declared for the function.<br>
- unsigned getImplicitArgNumBytes(const Function &F) const {<br>
- if (isMesaKernel(F))<br>
- return 16;<br>
- return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes"<wbr>, 0);<br>
- }<br>
-<br>
// Scratch is allocated in 256 dword per wave blocks for the entire<br>
// wavefront. When viewed from the perspecive of an arbitrary workitem, this<br>
// is 4-byte aligned.<br>
@@ -825,10 +829,6 @@ public:<br>
return getGeneration() >= AMDGPUSubtarget::VOLCANIC_<wbr>ISLANDS;<br>
}<br>
<br>
- uint64_t getExplicitKernArgSize(const Function &F) const;<br>
- unsigned getKernArgSegmentSize(const Function &F,<br>
- int64_t ExplicitArgBytes = -1) const;<br>
-<br>
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs<br>
/// SGPRs<br>
unsigned getOccupancyWithNumSGPRs(<wbr>unsigned SGPRs) const;<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>R600.td<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/R600.td?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/R600.td?rev=337021&r1=<wbr>337020&r2=337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>R600.td (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>R600.td Fri Jul 13 09:40:25 2018<br>
@@ -52,8 +52,3 @@ def CC_R600 : CallingConv<[<br>
T30_XYZW, T31_XYZW, T32_XYZW<br>
]>>><br>
]>;<br>
-<br>
-// Calling convention for compute kernels<br>
-def CC_R600_Kernel : CallingConv<[<br>
- CCCustom<"allocateKernArg"><br>
-]>;<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>R600ISelLowering.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/R600ISelLowering.cpp?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>R600ISelLowering.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>R600ISelLowering.cpp Fri Jul 13 09:40:25 2018<br>
@@ -50,18 +50,6 @@<br>
<br>
using namespace llvm;<br>
<br>
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,<br>
- CCValAssign::LocInfo LocInfo,<br>
- ISD::ArgFlagsTy ArgFlags, CCState &State) {<br>
- MachineFunction &MF = State.getMachineFunction();<br>
- AMDGPUMachineFunction *MFI = MF.getInfo<<wbr>AMDGPUMachineFunction>();<br>
-<br>
- uint64_t Offset = MFI->allocateKernArg(LocVT.<wbr>getStoreSize(),<br>
- ArgFlags.getOrigAlign());<br>
- State.addLoc(CCValAssign::<wbr>getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));<br>
- return true;<br>
-}<br>
-<br>
#include "R600GenCallingConv.inc"<br>
<br>
R600TargetLowering::<wbr>R600TargetLowering(const TargetMachine &TM,<br>
@@ -234,7 +222,7 @@ R600TargetLowering::<wbr>R600TargetLowering(c<br>
setOperationAction(ISD::FMA, MVT::f32, Expand);<br>
setOperationAction(ISD::FMA, MVT::f64, Expand);<br>
}<br>
- <br>
+<br>
// FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we<br>
// need it for R600.<br>
if (!Subtarget->hasFP32Denormals(<wbr>))<br>
@@ -1583,7 +1571,7 @@ CCAssignFn *R600TargetLowering::CCAssign<br>
case CallingConv::C:<br>
case CallingConv::Fast:<br>
case CallingConv::Cold:<br>
- return CC_R600_Kernel;<br>
+ llvm_unreachable("kernels should not be handled here");<br>
case CallingConv::AMDGPU_VS:<br>
case CallingConv::AMDGPU_GS:<br>
case CallingConv::AMDGPU_PS:<br>
@@ -1658,13 +1646,12 @@ SDValue R600TargetLowering::<wbr>LowerFormalA<br>
<br>
unsigned ValBase = ArgLocs[In.getOrigArgIndex()].<wbr>getLocMemOffset();<br>
unsigned PartOffset = VA.getLocMemOffset();<br>
- unsigned Offset = Subtarget-><wbr>getExplicitKernelArgOffset(MF.<wbr>getFunction()) +<br>
- VA.getLocMemOffset();<br>
<br>
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy)<wbr>, PartOffset - ValBase);<br>
SDValue Arg = DAG.getLoad(<br>
ISD::UNINDEXED, Ext, VT, DL, Chain,<br>
- DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,<br>
+ DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),<br>
+ PtrInfo,<br>
MemVT, /* Alignment = */ 4, MachineMemOperand::<wbr>MONonTemporal |<br>
MachineMemOperand::<wbr>MODereferenceable |<br>
MachineMemOperand::<wbr>MOInvariant);<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>SIISelLowering.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/SIISelLowering.cpp?rev=<wbr>337021&r1=337020&r2=337021&<wbr>view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>SIISelLowering.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>SIISelLowering.cpp Fri Jul 13 09:40:25 2018<br>
@@ -1164,8 +1164,8 @@ SDValue SITargetLowering::<wbr>lowerKernargMe<br>
// Try to avoid using an extload by loading earlier than the argument address,<br>
// and extracting the relevant bits. The load should hopefully be merged with<br>
// the previous argument.<br>
- if (Align < 4) {<br>
- assert(MemVT.getStoreSize() < 4);<br>
+ if (MemVT.getStoreSize() < 4 && Align < 4) {<br>
+ // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).<br>
int64_t AlignDownOffset = alignDown(Offset, 4);<br>
int64_t OffsetDiff = Offset - AlignDownOffset;<br>
<br>
@@ -1781,7 +1781,6 @@ SDValue SITargetLowering::<wbr>LowerFormalArg<br>
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit<br>
// kern arg offset.<br>
const unsigned KernelArgBaseAlign = 16;<br>
- const unsigned ExplicitOffset = Subtarget-><wbr>getExplicitKernelArgOffset(Fn)<wbr>;<br>
<br>
for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {<br>
const ISD::InputArg &Arg = Ins[i];<br>
@@ -1797,11 +1796,9 @@ SDValue SITargetLowering::<wbr>LowerFormalArg<br>
VT = Ins[i].VT;<br>
EVT MemVT = VA.getLocVT();<br>
<br>
- const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();<br>
+ const uint64_t Offset = VA.getLocMemOffset();<br>
unsigned Align = MinAlign(KernelArgBaseAlign, Offset);<br>
<br>
- // The first 36 bytes of the input buffer contains information about<br>
- // thread group and global sizes for clover.<br>
SDValue Arg = lowerKernargMemParameter(<br>
DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);<br>
Chains.push_back(Arg.getValue(<wbr>1));<br>
<br>
Modified: llvm/trunk/lib/Target/AMDGPU/<wbr>SIMachineFunctionInfo.cpp<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/lib/Target/<wbr>AMDGPU/SIMachineFunctionInfo.<wbr>cpp?rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/lib/Target/AMDGPU/<wbr>SIMachineFunctionInfo.cpp (original)<br>
+++ llvm/trunk/lib/Target/AMDGPU/<wbr>SIMachineFunctionInfo.cpp Fri Jul 13 09:40:25 2018<br>
@@ -54,6 +54,16 @@ SIMachineFunctionInfo::<wbr>SIMachineFunction<br>
<br>
Occupancy = getMaxWavesPerEU();<br>
limitOccupancy(MF);<br>
+ CallingConv::ID CC = F.getCallingConv();<br>
+<br>
+ if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {<br>
+ if (!F.arg_empty())<br>
+ KernargSegmentPtr = true;<br>
+ WorkGroupIDX = true;<br>
+ WorkItemIDX = true;<br>
+ } else if (CC == CallingConv::AMDGPU_PS) {<br>
+ PSInputAddr = AMDGPU::getInitialPSInputAddr(<wbr>F);<br>
+ }<br>
<br>
if (!isEntryFunction()) {<br>
// Non-entry functions have no special inputs for now, other registers<br>
@@ -73,21 +83,11 @@ SIMachineFunctionInfo::<wbr>SIMachineFunction<br>
} else {<br>
if (F.hasFnAttribute("amdgpu-<wbr>implicitarg-ptr")) {<br>
KernargSegmentPtr = true;<br>
- assert(MaxKernArgAlign == 0);<br>
- MaxKernArgAlign = ST.<wbr>getAlignmentForImplicitArgPtr(<wbr>);<br>
+ MaxKernArgAlign = std::max(ST.<wbr>getAlignmentForImplicitArgPtr(<wbr>),<br>
+ MaxKernArgAlign);<br>
}<br>
}<br>
<br>
- CallingConv::ID CC = F.getCallingConv();<br>
- if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {<br>
- if (!F.arg_empty())<br>
- KernargSegmentPtr = true;<br>
- WorkGroupIDX = true;<br>
- WorkItemIDX = true;<br>
- } else if (CC == CallingConv::AMDGPU_PS) {<br>
- PSInputAddr = AMDGPU::getInitialPSInputAddr(<wbr>F);<br>
- }<br>
-<br>
if (ST.debuggerEmitPrologue()) {<br>
// Enable everything.<br>
WorkGroupIDX = true;<br>
<br>
Modified: llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-args.ll<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/kernel-args.ll?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/test/<wbr>CodeGen/AMDGPU/kernel-args.ll?<wbr>rev=337021&r1=337020&r2=<wbr>337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-args.ll (original)<br>
+++ llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-args.ll Fri Jul 13 09:40:25 2018<br>
@@ -589,6 +589,17 @@ entry:<br>
; ret void<br>
; }<br>
<br>
+; FUNC-LABEL: {{^}}i65_arg:<br>
+; HSA-VI: kernarg_segment_byte_size = 24<br>
+; HSA-VI: kernarg_segment_alignment = 4<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {<br>
+entry:<br>
+ store i65 %in, i65 addrspace(1)* %out, align 4<br>
+ ret void<br>
+}<br>
+<br>
; FUNC-LABEL: {{^}}i1_arg:<br>
; HSA-VI: kernarg_segment_byte_size = 12<br>
; HSA-VI: kernarg_segment_alignment = 4<br>
@@ -651,7 +662,7 @@ define amdgpu_kernel void @i1_arg_sext_i<br>
}<br>
<br>
; FUNC-LABEL: {{^}}empty_struct_arg:<br>
-; HSA: kernarg_segment_byte_size = 0<br>
+; HSA-VI: kernarg_segment_byte_size = 0<br>
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {<br>
ret void<br>
}<br>
@@ -667,11 +678,11 @@ define amdgpu_kernel void @empty_struct_<br>
<br>
; FIXME: Total argument size is computed wrong<br>
; FUNC-LABEL: {{^}}struct_argument_<wbr>alignment:<br>
-; HSA: kernarg_segment_byte_size = 40<br>
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18<br>
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20<br>
+; HSA-VI: kernarg_segment_byte_size = 40<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20<br>
define amdgpu_kernel void @struct_argument_alignment({<wbr>i32, i64} %arg0, i8, {i32, i64} %arg1) {<br>
%val0 = extractvalue {i32, i64} %arg0, 0<br>
%val1 = extractvalue {i32, i64} %arg0, 1<br>
@@ -687,11 +698,11 @@ define amdgpu_kernel void @struct_argume<br>
; No padding between i8 and next struct, but round up at end to 4 byte<br>
; multiple.<br>
; FUNC-LABEL: {{^}}packed_struct_argument_<wbr>alignment:<br>
-; HSA: kernarg_segment_byte_size = 28<br>
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4<br>
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc<br>
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10<br>
+; HSA-VI: kernarg_segment_byte_size = 28<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10<br>
define amdgpu_kernel void @packed_struct_argument_<wbr>alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {<br>
%val0 = extractvalue <{i32, i64}> %arg0, 0<br>
%val1 = extractvalue <{i32, i64}> %arg0, 1<br>
@@ -703,3 +714,47 @@ define amdgpu_kernel void @packed_struct<br>
store volatile i64 %val3, i64 addrspace(1)* null<br>
ret void<br>
}<br>
+<br>
+; GCN-LABEL: {{^}}struct_argument_<wbr>alignment_after:<br>
+; HSA-VI: kernarg_segment_byte_size = 64<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20<br>
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30<br>
+define amdgpu_kernel void @struct_argument_alignment_<wbr>after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {<br>
+ %val0 = extractvalue {i32, i64} %arg0, 0<br>
+ %val1 = extractvalue {i32, i64} %arg0, 1<br>
+ %val2 = extractvalue {i32, i64} %arg2, 0<br>
+ %val3 = extractvalue {i32, i64} %arg2, 1<br>
+ store volatile i32 %val0, i32 addrspace(1)* null<br>
+ store volatile i64 %val1, i64 addrspace(1)* null<br>
+ store volatile i32 %val2, i32 addrspace(1)* null<br>
+ store volatile i64 %val3, i64 addrspace(1)* null<br>
+ store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null<br>
+ ret void<br>
+}<br>
+<br>
+; GCN-LABEL: {{^}}array_3xi32:<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc<br>
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {<br>
+ store volatile i16 %arg0, i16 addrspace(1)* undef<br>
+ store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef<br>
+ ret void<br>
+}<br>
+<br>
+; FIXME: Why not all scalar loads?<br>
+; GCN-LABEL: {{^}}array_3xi16:<br>
+; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2<br>
+; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0<br>
+; HSA-VI: flat_load_ushort<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4<br>
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {<br>
+ store volatile i8 %arg0, i8 addrspace(1)* undef<br>
+ store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef<br>
+ ret void<br>
+}<br>
<br>
Added: llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-argument-dag-<wbr>lowering.ll<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll?rev=337021&view=auto" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/test/<wbr>CodeGen/AMDGPU/kernel-<wbr>argument-dag-lowering.ll?rev=<wbr>337021&view=auto</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-argument-dag-<wbr>lowering.ll (added)<br>
+++ llvm/trunk/test/CodeGen/<wbr>AMDGPU/kernel-argument-dag-<wbr>lowering.ll Fri Jul 13 09:40:25 2018<br>
@@ -0,0 +1,132 @@<br>
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-<wbr>arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,<wbr>FUNC %s<br>
+<br>
+; Repeat of some problematic tests in kernel-args.ll, with the IR<br>
+; argument lowering pass disabled. Struct padding needs to be<br>
+; accounted for, as well as legalization of types changing offsets.<br>
+<br>
+; FUNC-LABEL: {{^}}i1_arg:<br>
+; HSA-VI: kernarg_segment_byte_size = 12<br>
+; HSA-VI: kernarg_segment_alignment = 4<br>
+<br>
+; GCN: s_load_dword s<br>
+; GCN: s_and_b32<br>
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {<br>
+ store i1 %x, i1 addrspace(1)* %out, align 1<br>
+ ret void<br>
+}<br>
+<br>
+; FUNC-LABEL: {{^}}v3i8_arg:<br>
+; HSA-VI: kernarg_segment_byte_size = 12<br>
+; HSA-VI: kernarg_segment_alignment = 4<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8<br>
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {<br>
+entry:<br>
+ store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4<br>
+ ret void<br>
+}<br>
+<br>
+; FUNC-LABEL: {{^}}i65_arg:<br>
+; HSA-VI: kernarg_segment_byte_size = 24<br>
+; HSA-VI: kernarg_segment_alignment = 4<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {<br>
+entry:<br>
+ store i65 %in, i65 addrspace(1)* %out, align 4<br>
+ ret void<br>
+}<br>
+<br>
+; FUNC-LABEL: {{^}}empty_struct_arg:<br>
+; HSA-VI: kernarg_segment_byte_size = 0<br>
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {<br>
+ ret void<br>
+}<br>
+<br>
+; The correct load offsets for these:<br>
+; load 4 from 0,<br>
+; load 8 from 8<br>
+; load 4 from 24<br>
+; load 8 from 32<br>
+<br>
+; With the SelectionDAG argument lowering, the alignments for the<br>
+; struct members are not properly considered, making these wrong.<br>
+<br>
+; FIXME: Total argument size is computed wrong<br>
+; FUNC-LABEL: {{^}}struct_argument_<wbr>alignment:<br>
+; HSA-VI: kernarg_segment_byte_size = 40<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20<br>
+define amdgpu_kernel void @struct_argument_alignment({<wbr>i32, i64} %arg0, i8, {i32, i64} %arg1) {<br>
+ %val0 = extractvalue {i32, i64} %arg0, 0<br>
+ %val1 = extractvalue {i32, i64} %arg0, 1<br>
+ %val2 = extractvalue {i32, i64} %arg1, 0<br>
+ %val3 = extractvalue {i32, i64} %arg1, 1<br>
+ store volatile i32 %val0, i32 addrspace(1)* null<br>
+ store volatile i64 %val1, i64 addrspace(1)* null<br>
+ store volatile i32 %val2, i32 addrspace(1)* null<br>
+ store volatile i64 %val3, i64 addrspace(1)* null<br>
+ ret void<br>
+}<br>
+<br>
+; No padding between i8 and next struct, but round up at end to 4 byte<br>
+; multiple.<br>
+; FUNC-LABEL: {{^}}packed_struct_argument_<wbr>alignment:<br>
+; HSA-VI: kernarg_segment_byte_size = 28<br>
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13<br>
+; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4<br>
+define amdgpu_kernel void @packed_struct_argument_<wbr>alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {<br>
+ %val0 = extractvalue <{i32, i64}> %arg0, 0<br>
+ %val1 = extractvalue <{i32, i64}> %arg0, 1<br>
+ %val2 = extractvalue <{i32, i64}> %arg1, 0<br>
+ %val3 = extractvalue <{i32, i64}> %arg1, 1<br>
+ store volatile i32 %val0, i32 addrspace(1)* null<br>
+ store volatile i64 %val1, i64 addrspace(1)* null<br>
+ store volatile i32 %val2, i32 addrspace(1)* null<br>
+ store volatile i64 %val3, i64 addrspace(1)* null<br>
+ ret void<br>
+}<br>
+<br>
+; GCN-LABEL: {{^}}struct_argument_<wbr>alignment_after:<br>
+; HSA-VI: kernarg_segment_byte_size = 64<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18<br>
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20<br>
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30<br>
+define amdgpu_kernel void @struct_argument_alignment_<wbr>after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {<br>
+ %val0 = extractvalue {i32, i64} %arg0, 0<br>
+ %val1 = extractvalue {i32, i64} %arg0, 1<br>
+ %val2 = extractvalue {i32, i64} %arg2, 0<br>
+ %val3 = extractvalue {i32, i64} %arg2, 1<br>
+ store volatile i32 %val0, i32 addrspace(1)* null<br>
+ store volatile i64 %val1, i64 addrspace(1)* null<br>
+ store volatile i32 %val2, i32 addrspace(1)* null<br>
+ store volatile i64 %val3, i64 addrspace(1)* null<br>
+ store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null<br>
+ ret void<br>
+}<br>
+<br>
+; GCN-LABEL: {{^}}array_3xi32:<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc<br>
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {<br>
+ store volatile i16 %arg0, i16 addrspace(1)* undef<br>
+ store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef<br>
+ ret void<br>
+}<br>
+<br>
+; GCN-LABEL: {{^}}array_3xi16:<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0<br>
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4<br>
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {<br>
+ store volatile i8 %arg0, i8 addrspace(1)* undef<br>
+ store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef<br>
+ ret void<br>
+}<br>
<br>
Modified: llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.<wbr>implicitarg.ptr.ll<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/test/<wbr>CodeGen/AMDGPU/llvm.amdgcn.<wbr>implicitarg.ptr.ll?rev=337021&<wbr>r1=337020&r2=337021&view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.<wbr>implicitarg.ptr.ll (original)<br>
+++ llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.<wbr>implicitarg.ptr.ll Fri Jul 13 09:40:25 2018<br>
@@ -33,7 +33,7 @@ define amdgpu_kernel void @opencl_kernel<br>
; GCN: enable_sgpr_kernarg_segment_<wbr>ptr = 1<br>
<br>
; HSA: kernarg_segment_byte_size = 112<br>
-; MESA: kernarg_segment_byte_size = 464<br>
+; MESA: kernarg_segment_byte_size = 128<br>
<br>
; HSA: s_load_dword s0, s[4:5], 0x1c<br>
define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {<br>
@@ -47,7 +47,7 @@ define amdgpu_kernel void @kernel_implic<br>
; GCN: enable_sgpr_kernarg_segment_<wbr>ptr = 1<br>
<br>
; HSA: kernarg_segment_byte_size = 160<br>
-; MESA: kernarg_segment_byte_size = 464<br>
+; MESA: kernarg_segment_byte_size = 128<br>
<br>
; HSA: s_load_dword s0, s[4:5], 0x1c<br>
define amdgpu_kernel void @opencl_kernel_implicitarg_<wbr>ptr([112 x i8]) #1 {<br>
@@ -118,10 +118,10 @@ define amdgpu_kernel void @opencl_kernel<br>
; GCN-LABEL: {{^}}kernel_call_implicitarg_<wbr>ptr_func:<br>
; GCN: enable_sgpr_kernarg_segment_<wbr>ptr = 1<br>
; HSA: kernarg_segment_byte_size = 112<br>
-; MESA: kernarg_segment_byte_size = 464<br>
+; MESA: kernarg_segment_byte_size = 128<br>
<br>
; HSA: s_add_u32 s6, s4, 0x70<br>
-; MESA: s_add_u32 s6, s4, 0x1c0<br>
+; MESA: s_add_u32 s6, s4, 0x70<br>
<br>
; GCN: s_addc_u32 s7, s5, 0{{$}}<br>
; GCN: s_swappc_b64<br>
@@ -133,10 +133,9 @@ define amdgpu_kernel void @kernel_call_i<br>
; GCN-LABEL: {{^}}opencl_kernel_call_<wbr>implicitarg_ptr_func:<br>
; GCN: enable_sgpr_kernarg_segment_<wbr>ptr = 1<br>
; HSA: kernarg_segment_byte_size = 160<br>
-; MESA: kernarg_segment_byte_size = 464<br>
+; MESA: kernarg_segment_byte_size = 128<br>
<br>
-; HSA: s_add_u32 s6, s4, 0x70<br>
-; MESA: s_add_u32 s6, s4, 0x1c0<br>
+; GCN: s_add_u32 s6, s4, 0x70<br>
<br>
; GCN: s_addc_u32 s7, s5, 0{{$}}<br>
; GCN: s_swappc_b64<br>
@@ -219,8 +218,7 @@ define void @opencl_func_kernarg_implici<br>
<br>
; GCN-LABEL: {{^}}kernel_call_kernarg_<wbr>implicitarg_ptr_func:<br>
; GCN: s_mov_b64 s[6:7], s[4:5]<br>
-; HSA: s_add_u32 s8, s6, 0x70<br>
-; MESA: s_add_u32 s8, s6, 0x1c0<br>
+; GCN: s_add_u32 s8, s6, 0x70<br>
; GCN: s_addc_u32 s9, s7, 0<br>
; GCN: s_swappc_b64<br>
define amdgpu_kernel void @kernel_call_kernarg_<wbr>implicitarg_ptr_func([112 x i8]) #0 {<br>
<br>
Modified: llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.kernarg.<wbr>segment.ptr.ll<br>
URL: <a href="http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll?rev=337021&r1=337020&r2=337021&view=diff" rel="noreferrer" target="_blank">http://llvm.org/viewvc/llvm-<wbr>project/llvm/trunk/test/<wbr>CodeGen/AMDGPU/llvm.amdgcn.<wbr>kernarg.segment.ptr.ll?rev=<wbr>337021&r1=337020&r2=337021&<wbr>view=diff</a><br>
==============================<wbr>==============================<wbr>==================<br>
--- llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.kernarg.<wbr>segment.ptr.ll (original)<br>
+++ llvm/trunk/test/CodeGen/<wbr>AMDGPU/llvm.amdgcn.kernarg.<wbr>segment.ptr.ll Fri Jul 13 09:40:25 2018<br>
@@ -79,7 +79,7 @@ define amdgpu_kernel void @opencl_test_i<br>
; CO-V2: enable_sgpr_kernarg_segment_<wbr>ptr = 1<br>
; HSA: kernarg_segment_byte_size = 0<br>
; OS-MESA3D: kernarg_segment_byte_size = 16<br>
-; CO-V2: kernarg_segment_alignment = 32<br>
+; CO-V2: kernarg_segment_alignment = 4<br>
<br>
; HSA: s_load_dword s{{[0-9]+}}, s[4:5]<br>
define amdgpu_kernel void @test_no_kernargs() #1 {<br>
<br>
<br>
______________________________<wbr>_________________<br>
llvm-commits mailing list<br>
<a href="mailto:llvm-commits@lists.llvm.org">llvm-commits@lists.llvm.org</a><br>
<a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits" rel="noreferrer" target="_blank">http://lists.llvm.org/cgi-bin/<wbr>mailman/listinfo/llvm-commits</a><br>
</blockquote></div><br></div>