[llvm] 0136a44 - [OpenMP] Add an option to limit shared memory usage in OpenMPOpt
Joseph Huber via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 18 05:35:49 PST 2022
Author: Joseph Huber
Date: 2022-02-18T08:35:26-05:00
New Revision: 0136a4401f90b1effd07b3ac8575fdb155f9984d
URL: https://github.com/llvm/llvm-project/commit/0136a4401f90b1effd07b3ac8575fdb155f9984d
DIFF: https://github.com/llvm/llvm-project/commit/0136a4401f90b1effd07b3ac8575fdb155f9984d.diff
LOG: [OpenMP] Add an option to limit shared memory usage in OpenMPOpt
One of the optimizations performed in OpenMPOpt pushes globalized
variables to static shared memory. This is preferable to keeping the
runtime call in all cases, however if too many variables are pushed to
shared memory the kernel will crash. Since this is an optimization and
not something the user specified explicitly, there should be an option
to limit this optimization in those cases. This patch introduces the
`-openmp-opt-shared-limit=` option to limit the amount of bytes that
will be placed in shared memory from HeapToShared.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D120079
Added:
Modified:
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/replace_globalization.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 392b919c5a120..57a854f2e239e 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -129,6 +129,11 @@ static cl::opt<unsigned>
cl::desc("Maximal number of attributor iterations."),
cl::init(256));
+static cl::opt<unsigned>
+ SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
+ cl::desc("Maximum amount of shared memory to use."),
+ cl::init(std::numeric_limits<unsigned>::max()));
+
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
"Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -3000,6 +3005,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
+ if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
+ LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
+ << " with shared memory."
+ << " Shared memory usage is limited to "
+ << SharedMemoryLimit << " bytes\n");
+ continue;
+ }
+
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
<< " with " << AllocSize->getZExtValue()
<< " bytes of shared memory\n");
@@ -3034,7 +3047,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
A.deleteAfterManifest(*CB);
A.deleteAfterManifest(*FreeCalls.front());
- NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
+ SharedMemoryUsed += AllocSize->getZExtValue();
+ NumBytesMovedToSharedMemory = SharedMemoryUsed;
Changed = ChangeStatus::CHANGED;
}
@@ -3070,6 +3084,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
SmallSetVector<CallBase *, 4> MallocCalls;
/// Collection of potentially removed free calls in a function.
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
+ /// The total amount of shared memory that has been used for HeapToShared.
+ unsigned SharedMemoryUsed = 0;
};
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index aaf48e937ca05..efec38855c9a8 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64"
@@ -8,6 +9,8 @@ target triple = "nvptx64"
; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
; CHECK-REMARKS-NOT: 6 bytes
+; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
+; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization
; UTC_ARGS: --enable
%struct.ident_t = type { i32, i32, i32, i32, i8* }
More information about the llvm-commits
mailing list