[Openmp-commits] [llvm] [openmp] [OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP behavior (PR #74080)
Michael Halkenhäuser via Openmp-commits
openmp-commits at lists.llvm.org
Tue Mar 5 06:18:19 PST 2024
https://github.com/mhalk updated https://github.com/llvm/llvm-project/pull/74080
>From 0f191fe5de314ae8790bd9636ccc8e2d54146710 Mon Sep 17 00:00:00 2001
From: Michael Halkenhaeuser <MichaelGerald.Halkenhauser at amd.com>
Date: Fri, 1 Dec 2023 07:55:01 -0600
Subject: [PATCH] [OpenMP][AMDGPU] Adapt dynamic callstack sizes to HIP
behavior
Added a mechanism to cap values provided via LIBOMPTARGET_STACK_SIZE to a
GFX-dependent value.
Changed several minor properties to be in sync with HIP:
1. Default device stack size: 1024 / 1 KiB (hipLimitStackSize).
2. During AQL packet generation in case of a dyn callstack the maximum
between user-provided and compiler-default is chosen.
3. Make sure we only allow 32bit values for stack size.
Added testcase where a dynamic stack is required due to recursion.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 17 ++--
llvm/lib/Target/AMDGPU/AMDGPUFeatures.td | 15 ++++
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 +
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 14 ----
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 26 ++++++
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 21 +++++
.../plugins-nextgen/amdgpu/src/rtl.cpp | 38 +++++++--
.../test/offloading/dynamic_callstack.c | 80 +++++++++++++++++++
8 files changed, 185 insertions(+), 29 deletions(-)
create mode 100644 openmp/libomptarget/test/offloading/dynamic_callstack.c
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 7c278fd574ede51..881cd545c688388 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1043,7 +1043,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
- FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureMaxWaveScratchSize13x256
]
>;
@@ -1054,7 +1055,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
- FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
+ FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureMaxWaveScratchSize13x256
]
>;
@@ -1070,7 +1072,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
- FeatureDefaultComponentZero
+ FeatureDefaultComponentZero, FeatureMaxWaveScratchSize13x256
]
>;
@@ -1088,7 +1090,8 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
+ FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero,
+ FeatureMaxWaveScratchSize13x256
]
>;
@@ -1109,7 +1112,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
FeatureGDS, FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength63
+ FeatureMaxHardClauseLength63, FeatureMaxWaveScratchSize13x256
]
>;
@@ -1130,7 +1133,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
FeatureGWS, FeatureDefaultComponentZero,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize15x64
]
>;
@@ -1151,7 +1154,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast,
- FeatureMaxHardClauseLength32
+ FeatureMaxHardClauseLength32, FeatureMaxWaveScratchSize18x64
]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index 3533087bbfd1bf2..39f7f52408c41bf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -51,3 +51,18 @@ def FeaturePromoteAlloca : SubtargetFeature <"promote-alloca",
"Enable promote alloca pass"
>;
+class SubtargetFeatureMaxWaveScratchSize <int Value, int FieldSize,
+ int Elements> :
+ SubtargetFeature<
+ "maxwavescratchsize"#FieldSize#"x"#Elements,
+ "MaxWaveScratchSize",
+ !cast<string>(Value),
+ "The dynamic callstack size in bytes"
+>;
+
+def FeatureMaxWaveScratchSize13x256 :
+ SubtargetFeatureMaxWaveScratchSize<8387584, 13, 256>;
+def FeatureMaxWaveScratchSize15x64 :
+ SubtargetFeatureMaxWaveScratchSize<8388352, 15, 64>;
+def FeatureMaxWaveScratchSize18x64 :
+ SubtargetFeatureMaxWaveScratchSize<67108608, 18, 64>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b72697973be7a11..19f70bcc1733cef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -68,6 +68,7 @@ class AMDGPUSubtarget {
unsigned MaxWavesPerEU = 10;
unsigned LocalMemorySize = 0;
unsigned AddressableLocalMemorySize = 0;
+ unsigned MaxWaveScratchSize = 0;
char WavefrontSizeLog2 = 0;
public:
@@ -234,6 +235,8 @@ class AMDGPUSubtarget {
return AddressableLocalMemorySize;
}
+ unsigned getMaxWaveScratchSize() const { return MaxWaveScratchSize; }
+
/// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
/// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
/// CU mode into account.
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index a933c16b6ed516c..78dcaf15cf06190 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -302,20 +302,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return (Generation)Gen;
}
- unsigned getMaxWaveScratchSize() const {
- // See COMPUTE_TMPRING_SIZE.WAVESIZE.
- if (getGeneration() >= GFX12) {
- // 18-bit field in units of 64-dword.
- return (64 * 4) * ((1 << 18) - 1);
- }
- if (getGeneration() == GFX11) {
- // 15-bit field in units of 64-dword.
- return (64 * 4) * ((1 << 15) - 1);
- }
- // 13-bit field in units of 256-dword.
- return (256 * 4) * ((1 << 13) - 1);
- }
-
/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 63285c06edaf2ce..7026b52b99cfbe6 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -203,6 +203,16 @@ uint8_t getELFABIVersion(const Triple &T, unsigned CodeObjectVersion) {
}
}
+unsigned getMaxWaveScratchSize(SubtargetGeneration Generation) {
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE.
+ if (Generation < SubtargetGeneration::GFX11) {
+ // 13-bit field in units of 256-dword.
+ return (256 * 4) * ((1 << 13) - 1);
+ }
+ // 15-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 15) - 1);
+}
+
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
case AMDHSA_COV4:
@@ -900,6 +910,22 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) {
return 0;
}
+unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI) {
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE.
+ if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize18x64)) {
+ // 18-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 18) - 1);
+ }
+
+ if (STI->getFeatureBits().test(FeatureMaxWaveScratchSize15x64)) {
+ // 15-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 15) - 1);
+ }
+
+ // 13-bit field in units of 256-dword.
+ return (256 * 4) * ((1 << 13) - 1);
+}
+
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
// "Per CU" really means "per whatever functional block the waves of a
// workgroup must share". For gfx10 in CU mode this is the CU, which contains
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 9fcb4caca30b01f..fc0972b8211a0d3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -82,6 +82,23 @@ unsigned getHostcallImplicitArgPosition(unsigned COV);
unsigned getDefaultQueueImplicitArgPosition(unsigned COV);
unsigned getCompletionActionImplicitArgPosition(unsigned COV);
+enum class SubtargetGeneration {
+ INVALID = 0,
+ R600 = 1,
+ R700 = 2,
+ EVERGREEN = 3,
+ NORTHERN_ISLANDS = 4,
+ SOUTHERN_ISLANDS = 5,
+ SEA_ISLANDS = 6,
+ VOLCANIC_ISLANDS = 7,
+ GFX9 = 8,
+ GFX10 = 9,
+ GFX11 = 10,
+ GFX12 = 11,
+};
+
+unsigned getMaxWaveScratchSize(SubtargetGeneration Generation);
+
struct GcnBufferFormatInfo {
unsigned Format;
unsigned BitsPerComp;
@@ -206,6 +223,10 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI);
/// \p STI.
unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI);
+/// \returns Maximum dynamic callstack size in bytes for given subtarget
+/// \p STI.
+unsigned getMaxWaveScratchSize(const MCSubtargetInfo *STI);
+
/// \returns Number of execution units per compute unit for given subtarget \p
/// STI.
unsigned getEUsPerCU(const MCSubtargetInfo *STI);
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 81634ae1edc4908..739e014cdaa36cf 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -704,7 +704,7 @@ struct AMDGPUQueueTy {
/// signal and can define an optional input signal (nullptr if none).
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, uint64_t StackSize,
+ uint32_t GroupSize, uint32_t StackSize,
AMDGPUSignalTy *OutputSignal,
AMDGPUSignalTy *InputSignal) {
assert(OutputSignal && "Invalid kernel output signal");
@@ -743,7 +743,8 @@ struct AMDGPUQueueTy {
Packet->grid_size_y = 1;
Packet->grid_size_z = 1;
Packet->private_segment_size =
- Kernel.usesDynamicStack() ? StackSize : Kernel.getPrivateSize();
+ Kernel.usesDynamicStack() ? std::max(Kernel.getPrivateSize(), StackSize)
+ : Kernel.getPrivateSize();
Packet->group_segment_size = GroupSize;
Packet->kernel_object = Kernel.getKernelObject();
Packet->kernarg_address = KernelArgs;
@@ -1212,7 +1213,7 @@ struct AMDGPUStreamTy {
/// the kernel args buffer to the specified memory manager.
Error pushKernelLaunch(const AMDGPUKernelTy &Kernel, void *KernelArgs,
uint32_t NumThreads, uint64_t NumBlocks,
- uint32_t GroupSize, uint64_t StackSize,
+ uint32_t GroupSize, uint32_t StackSize,
AMDGPUMemoryManagerTy &MemoryManager) {
if (Queue == nullptr)
return Plugin::error("Target queue was nullptr");
@@ -1975,6 +1976,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
.contains("xnack+"))
IsXnackEnabled = true;
+ // See: 'getMaxWaveScratchSize' in 'llvm/lib/Target/AMDGPU/GCNSubtarget.h'.
+ // See: e.g. 'FeatureMaxWaveScratchSize13x256' in
+ // 'llvm/lib/Target/AMDGPU/AMDGPUFeatures.td'
+ // ToDo: Relay MaxWaveScratchSize value here
+ // MaxThreadScratchSize = GCNSubtarget.getMaxWaveScratchSize() /
+ // WavefrontSize;
+
// detect if device is an APU.
if (auto Err = checkIfAPU())
return Err;
@@ -2708,7 +2716,17 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
return Plugin::success();
}
Error setDeviceStackSize(uint64_t Value) override {
- StackSize = Value;
+ if (Value > MaxThreadScratchSize) {
+ // Cap device scratch size.
+ MESSAGE("Scratch memory size will be set to %d. Reason: Requested size "
+ "%ld would exceed available resources.",
+ MaxThreadScratchSize, Value);
+ StackSize = MaxThreadScratchSize;
+ } else {
+ // Apply device scratch size, since it is within limits.
+ StackSize = Value;
+ }
+
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
@@ -2896,9 +2914,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
- /// The current size of the stack that will be used in cases where it could
- /// not be statically determined.
- uint64_t StackSize = 16 * 1024 /* 16 KB */;
+ /// Default: 1024, in conformity to hipLimitStackSize.
+ uint64_t StackSize = 1024 /* 1 KiB */;
+
+ // The maximum scratch memory size per thread.
+ // See COMPUTE_TMPRING_SIZE.WAVESIZE (divided by threads per wave).
+ uint32_t MaxThreadScratchSize;
/// Is the plugin associated with an APU?
bool IsAPU = false;
@@ -3314,7 +3335,8 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
- GroupSize, StackSize, ArgsMemoryManager);
+ GroupSize, static_cast<uint32_t>(StackSize),
+ ArgsMemoryManager);
}
Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
diff --git a/openmp/libomptarget/test/offloading/dynamic_callstack.c b/openmp/libomptarget/test/offloading/dynamic_callstack.c
new file mode 100644
index 000000000000000..9de30d7b7b690b0
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/dynamic_callstack.c
@@ -0,0 +1,80 @@
+#include <omp.h>
+#include <stdio.h>
+
+// RUN: %libomptarget-compile-amdgcn-amd-amdhsa -O2 -mcode-object-version=5
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=4096 \
+// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN: | %fcheck-amdgcn-amd-amdhsa
+
+// RUN: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN: env LIBOMPTARGET_STACK_SIZE=131073 \
+// RUN: %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_EXCEEDED
+
+// TODO: Realize the following run in an acceptable manner.
+// Unfortunately with insufficient scratch mem size programs will hang.
+// Therefore, a timeout mechanism would help tremendously.
+// Additionally, we need to allow empty output / unsuccessful execution.
+
+// RUN?: env OMP_TARGET_OFFLOAD=MANDATORY \
+// RUN?: env LIBOMPTARGET_STACK_SIZE=16 \
+// RUN?: timeout 10 %libomptarget-run-amdgcn-amd-amdhsa 2>&1 \
+// RUN?: | %fcheck-amdgcn-amd-amdhsa -check-prefix=LIMIT_INSUFFICIENT \
+// RUN?: --allow-empty
+
+// REQUIRES: amdgcn-amd-amdhsa
+
+// Cause the compiler to set amdhsa_uses_dynamic_stack to '1' using recursion.
+// That is: stack requirement for main's target region may not be calculated.
+
+// This recursive function will eventually return 0.
+int recursiveFunc(const int Recursions) {
+ if (Recursions < 1)
+ return 0;
+
+ int j[Recursions];
+#pragma omp target private(j)
+ { ; }
+
+ return recursiveFunc(Recursions - 1);
+}
+
+int main() {
+ int N = 256;
+ int a[N];
+ int b[N];
+ int i;
+
+ for (i = 0; i < N; i++)
+ a[i] = 0;
+
+ for (i = 0; i < N; i++)
+ b[i] = i;
+
+#pragma omp target parallel for
+ {
+ for (int j = 0; j < N; j++)
+ a[j] = b[j] + recursiveFunc(j);
+ }
+
+ int rc = 0;
+ for (i = 0; i < N; i++)
+ if (a[i] != b[i]) {
+ rc++;
+ printf("Wrong value: a[%d]=%d\n", i, a[i]);
+ }
+
+ if (!rc)
+ printf("Success\n");
+
+ return rc;
+}
+
+/// CHECK: Success
+
+/// LIMIT_EXCEEDED: Scratch memory size will be set to
+/// LIMIT_EXCEEDED: Success
+
+/// LIMIT_INSUFFICIENT-NOT: Success
More information about the Openmp-commits
mailing list