[llvm] 90ff148 - AMDGPU: Account for implicit argument alignment for kernarg segment
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 9 14:48:51 PST 2021
Author: Matt Arsenault
Date: 2021-11-09T17:48:37-05:00
New Revision: 90ff14871904881fb156a1d4d5fb083ca75998ab
URL: https://github.com/llvm/llvm-project/commit/90ff14871904881fb156a1d4d5fb083ca75998ab
DIFF: https://github.com/llvm/llvm-project/commit/90ff14871904881fb156a1d4d5fb083ca75998ab.diff
LOG: AMDGPU: Account for implicit argument alignment for kernarg segment
If a kernel had no formal arguments but did have the implicit
arguments, we were reporting a required kernarg alignment of 4. For
some reason we require an 8-byte alignment for the implicit arguments,
even though there's no real advantage and I don't see where this
requirement is documented in the ABI.
The code object header code also claims the minimum alignment is 16,
which is what I thought you always got at runtime anyway so I don't
know why this matters.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index ecb60089aca99..bb2e723f4ab06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1092,6 +1092,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
// kernarg_segment_alignment is specified as log of the alignment.
// The minimum alignment is 16.
+ // FIXME: The metadata treats the minimum as 4?
Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 3b4f92f613e77..b9c59f4c615a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -201,10 +201,11 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
Align MaxKernArgAlign;
HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
MaxKernArgAlign);
+ HSACodeProps.mKernargSegmentAlign =
+ std::max(MaxKernArgAlign, Align(4)).value();
+
HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign =
- std::max(MaxKernArgAlign, Align(4)).value();
HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
@@ -867,6 +868,8 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
+
+ // FIXME: The code object header treats the minimum as 16?
Kern[".kernarg_segment_align"] =
Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index d19431a59dbff..0655b4342ba10 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -688,6 +688,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
if (ImplicitBytes != 0) {
const Align Alignment = getAlignmentForImplicitArgPtr();
TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+ MaxAlign = std::max(MaxAlign, Alignment);
}
// Being able to dereference past the end is useful for emitting scalar loads.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index 7431832fa7d7e..e3ad5493b5c17 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -281,7 +281,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr_empty
; HSA: KernargSegmentSize: 48
-; HSA: KernargSegmentAlign: 4
+; HSA: KernargSegmentAlign: 8
; HSA-LABEL: - Name: kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 112
@@ -289,7 +289,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA-LABEL: - Name: opencl_kernel_implicitarg_ptr
; HSA: KernargSegmentSize: 160
-; HSA: KernargSegmentAlign: 4
+; HSA: KernargSegmentAlign: 8
; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 0
@@ -301,7 +301,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func_empty
; HSA: KernargSegmentSize: 48
-; HSA: KernargSegmentAlign: 4
+; HSA: KernargSegmentAlign: 8
; HSA-LABEL: - Name: kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 112
@@ -309,7 +309,7 @@ define amdgpu_kernel void @kernel_implicitarg_no_struct_align_padding(<16 x i32>
; HSA-LABEL: - Name: opencl_kernel_call_implicitarg_ptr_func
; HSA: KernargSegmentSize: 160
-; HSA: KernargSegmentAlign: 4
+; HSA: KernargSegmentAlign: 8
; HSA-LABEL: - Name: kernel_call_kernarg_implicitarg_ptr_func
; HSA: KernargSegmentSize: 112
More information about the llvm-commits mailing list