[llvm] 0136a44 - [OpenMP] Add an option to limit shared memory usage in OpenMPOpt

Joseph Huber via llvm-commits llvm-commits at lists.llvm.org
Fri Feb 18 05:35:49 PST 2022


Author: Joseph Huber
Date: 2022-02-18T08:35:26-05:00
New Revision: 0136a4401f90b1effd07b3ac8575fdb155f9984d

URL: https://github.com/llvm/llvm-project/commit/0136a4401f90b1effd07b3ac8575fdb155f9984d
DIFF: https://github.com/llvm/llvm-project/commit/0136a4401f90b1effd07b3ac8575fdb155f9984d.diff

LOG: [OpenMP] Add an option to limit shared memory usage in OpenMPOpt

One of the optimizations performed in OpenMPOpt pushes globalized
variables to static shared memory. This is preferable to keeping the
runtime call in all cases, however if too many variables are pushed to
hared memory the kernel will crash. Since this is an optimization and
not something the user specified explicitly, there should be an option
to limit this optimization in those cases. This path introduces the
`-openmp-opt-shared-limit=` option to limit the amount of bytes that
will be placed in shared memory from HeapToShared.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D120079

Added: 
    

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/replace_globalization.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 392b919c5a120..57a854f2e239e 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -129,6 +129,11 @@ static cl::opt<unsigned>
                           cl::desc("Maximal number of attributor iterations."),
                           cl::init(256));
 
+static cl::opt<unsigned>
+    SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden,
+                      cl::desc("Maximum amount of shared memory to use."),
+                      cl::init(std::numeric_limits<unsigned>::max()));
+
 STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
           "Number of OpenMP runtime calls deduplicated");
 STATISTIC(NumOpenMPParallelRegionsDeleted,
@@ -3000,6 +3005,14 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
 
       auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
 
+      if (AllocSize->getZExtValue() + SharedMemoryUsed > SharedMemoryLimit) {
+        LLVM_DEBUG(dbgs() << TAG << "Cannot replace call " << *CB
+                          << " with shared memory."
+                          << " Shared memory usage is limited to "
+                          << SharedMemoryLimit << " bytes\n");
+        continue;
+      }
+
       LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
                         << " with " << AllocSize->getZExtValue()
                         << " bytes of shared memory\n");
@@ -3034,7 +3047,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
       A.deleteAfterManifest(*CB);
       A.deleteAfterManifest(*FreeCalls.front());
 
-      NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
+      SharedMemoryUsed += AllocSize->getZExtValue();
+      NumBytesMovedToSharedMemory = SharedMemoryUsed;
       Changed = ChangeStatus::CHANGED;
     }
 
@@ -3070,6 +3084,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
   SmallSetVector<CallBase *, 4> MallocCalls;
   /// Collection of potentially removed free calls in a function.
   SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
+  /// The total amount of shared memory that has been used for HeapToShared.
+  unsigned SharedMemoryUsed = 0;
 };
 
 struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {

diff  --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index aaf48e937ca05..efec38855c9a8 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
 ; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
 ; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -check-prefix=CHECK-REMARKS
+; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -disable-output -openmp-opt-shared-limit=4 < %s 2>&1 | FileCheck %s -check-prefix=CHECK-LIMIT
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 target triple = "nvptx64"
 
@@ -8,6 +9,8 @@ target triple = "nvptx64"
 ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
 ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
 ; CHECK-REMARKS-NOT: 6 bytes
+; CHECK-LIMIT: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
+; CHECK-LIMIT: remark: replace_globalization.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization
 ; UTC_ARGS: --enable
 
 %struct.ident_t = type { i32, i32, i32, i32, i8* }


        


More information about the llvm-commits mailing list