[llvm] 6fc51c9 - [OpenMP] Replace GPU globalization calls with shared memory in the middle-end

via llvm-commits llvm-commits at lists.llvm.org
Tue Jun 22 09:09:51 PDT 2021


Author: Joseph Huber
Date: 2021-06-22T11:55:44-04:00
New Revision: 6fc51c9f7d6647ba78e5a235e7d8bfcf3ab2ede0

URL: https://github.com/llvm/llvm-project/commit/6fc51c9f7d6647ba78e5a235e7d8bfcf3ab2ede0
DIFF: https://github.com/llvm/llvm-project/commit/6fc51c9f7d6647ba78e5a235e7d8bfcf3ab2ede0.diff

LOG: [OpenMP] Replace GPU globalization calls with shared memory in the middle-end

Summary:
The changes introduced in D97680 create a simpler interface for code that
needs to be globalized. This interface is used here to simplify the
globalization calls in the middle end: any globalization call that is executed
by only a single thread in the team can be replaced with a static shared
memory buffer.
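
For context, globalization occurs when a variable in GPU target code escapes
into a parallel region and therefore has to live in memory visible to every
thread in the team. A minimal sketch of source that triggers it (a
hypothetical example, not taken from this commit; `use` is a placeholder):

    #pragma omp declare target
    extern void use(int *);

    void foo() {
      int x; // Escapes into the parallel region below, so clang lowers it to
             // __kmpc_alloc_shared(4) / __kmpc_free_shared rather than a
             // private stack slot.
    #pragma omp parallel
      use(&x);
    }
    #pragma omp end declare target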

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D97818

Added: 
    llvm/test/Transforms/OpenMP/replace_globalization.ll

Modified: 
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 4450a2cfcbf67..9b43134e378aa 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/IPO.h"
@@ -34,6 +35,7 @@
 #include "llvm/Transforms/Utils/CallGraphUpdater.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
 
+using namespace llvm::PatternMatch;
 using namespace llvm;
 using namespace omp;
 
@@ -75,6 +77,8 @@ STATISTIC(
     "Number of OpenMP parallel regions replaced with ID in GPU state machines");
 STATISTIC(NumOpenMPParallelRegionsMerged,
           "Number of OpenMP parallel regions merged");
+STATISTIC(NumBytesMovedToSharedMemory,
+          "Amount of memory pushed to shared memory");
 
 #if !defined(NDEBUG)
 static constexpr auto TAG = "[" DEBUG_TYPE "]";
@@ -82,6 +86,16 @@ static constexpr auto TAG = "[" DEBUG_TYPE "]";
 
 namespace {
 
+enum class AddressSpace : unsigned {
+  Generic = 0,
+  Global = 1,
+  Shared = 3,
+  Constant = 4,
+  Local = 5,
+};
+
+struct AAHeapToShared;
+
 struct AAICVTracker;
 
 /// OpenMP specific information. For now, stores RFIs and ICVs also needed for
@@ -512,6 +526,9 @@ struct OpenMPOpt {
     if (IsModulePass) {
       Changed |= runAttributor();
 
+      // Recollect uses, in case Attributor deleted any.
+      OMPInfoCache.recollectUses();
+
       if (remarksEnabled())
         analysisGlobalization();
     } else {
@@ -1122,28 +1139,23 @@ struct OpenMPOpt {
   }
 
   void analysisGlobalization() {
-    RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared,
-                                                 OMPRTL___kmpc_free_shared};
-
-    for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
-      auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
-
-      auto CheckGlobalization = [&](Use &U, Function &Decl) {
-        if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
-          auto Remark = [&](OptimizationRemarkAnalysis ORA) {
-            return ORA
-                   << "Found thread data sharing on the GPU. "
-                   << "Expect degraded performance due to data globalization.";
-          };
-          emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
-                                                 Remark);
-        }
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
 
-        return false;
-      };
+    auto CheckGlobalization = [&](Use &U, Function &Decl) {
+      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
+        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
+          return ORA
+                 << "Found thread data sharing on the GPU. "
+                 << "Expect degraded performance due to data globalization.";
+        };
+        emitRemark<OptimizationRemarkAnalysis>(CI, "OpenMPGlobalization",
+                                               Remark);
+      }
 
-      RFI.foreachUse(SCC, CheckGlobalization);
-    }
+      return false;
+    };
+
+    RFI.foreachUse(SCC, CheckGlobalization);
   }
 
   /// Maps the values stored in the offload arrays passed as arguments to
@@ -1604,6 +1616,12 @@ struct OpenMPOpt {
 
       GetterRFI.foreachUse(SCC, CreateAA);
     }
+    auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+    auto CreateAA = [&](Use &U, Function &F) {
+      A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
+      return false;
+    };
+    GlobalizationRFI.foreachUse(SCC, CreateAA);
 
     for (auto &F : M) {
       if (!F.isDeclaration())
@@ -2321,7 +2339,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
   // a constant zero.
  // TODO: Use AAValueSimplify to simplify and propagate constants.
   // TODO: Check more than a single use for thread ID's.
-  auto IsSingleThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
+  auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
     if (!Edge || !Edge->isConditional())
       return false;
     if (Edge->getSuccessor(0) != SuccessorBB)
@@ -2331,6 +2349,21 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
     if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
       return false;
 
+    // Temporarily match the pattern generated by clang for teams regions.
+    // TODO: Remove this once the new runtime is in place.
+    ConstantInt *One, *NegOne;
+    CmpInst::Predicate Pred;
+    auto &&m_ThreadID = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_tid_x>();
+    auto &&m_WarpSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_warpsize>();
+    auto &&m_BlockSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_ntid_x>();
+    if (match(Cmp, m_Cmp(Pred, m_ThreadID,
+                         m_And(m_Sub(m_BlockSize, m_ConstantInt(One)),
+                               m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)),
+                                     m_ConstantInt(NegOne))))))
+      if (One->isOne() && NegOne->isMinusOne() &&
+          Pred == CmpInst::Predicate::ICMP_EQ)
+        return true;
+
     ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
     if (!C || !C->isZero())
       return false;
@@ -2351,15 +2384,15 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
     if (pred_begin(BB) == pred_end(BB))
       return SingleThreadedBBs.contains(BB);
 
-    bool IsSingleThreaded = true;
+    bool IsInitialThread = true;
     for (auto PredBB = pred_begin(BB), PredEndBB = pred_end(BB);
          PredBB != PredEndBB; ++PredBB) {
-      if (!IsSingleThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
+      if (!IsInitialThreadOnly(dyn_cast<BranchInst>((*PredBB)->getTerminator()),
                               BB))
-        IsSingleThreaded &= SingleThreadedBBs.contains(*PredBB);
+        IsInitialThread &= SingleThreadedBBs.contains(*PredBB);
     }
 
-    return IsSingleThreaded;
+    return IsInitialThread;
   };
 
   for (auto *BB : RPOT) {
@@ -2372,10 +2405,145 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
              : ChangeStatus::CHANGED;
 }
 
+/// Try to replace memory allocation calls called by a single thread with a
+/// static buffer of shared memory.
+struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
+  using Base = StateWrapper<BooleanState, AbstractAttribute>;
+  AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+  /// Create an abstract attribute view for the position \p IRP.
+  static AAHeapToShared &createForPosition(const IRPosition &IRP,
+                                           Attributor &A);
+
+  /// See AbstractAttribute::getName().
+  const std::string getName() const override { return "AAHeapToShared"; }
+
+  /// See AbstractAttribute::getIdAddr().
+  const char *getIdAddr() const override { return &ID; }
+
+  /// This function should return true if the type of the \p AA is
+  /// AAHeapToShared.
+  static bool classof(const AbstractAttribute *AA) {
+    return (AA->getIdAddr() == &ID);
+  }
+
+  /// Unique ID (due to the unique address)
+  static const char ID;
+};
+
+struct AAHeapToSharedFunction : public AAHeapToShared {
+  AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
+      : AAHeapToShared(IRP, A) {}
+
+  const std::string getAsStr() const override {
+    return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
+           " malloc calls eligible.";
+  }
+
+  /// See AbstractAttribute::trackStatistics().
+  void trackStatistics() const override {}
+
+  void initialize(Attributor &A) override {
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+
+    for (User *U : RFI.Declaration->users())
+      if (CallBase *CB = dyn_cast<CallBase>(U))
+        MallocCalls.insert(CB);
+  }
+
+  ChangeStatus manifest(Attributor &A) override {
+    if (MallocCalls.empty())
+      return ChangeStatus::UNCHANGED;
+
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
+
+    Function *F = getAnchorScope();
+    auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
+                                            DepClassTy::OPTIONAL);
+
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
+    for (CallBase *CB : MallocCalls) {
+      // Skip replacing this if HeapToStack has already claimed it.
+      if (HS && HS->isKnownHeapToStack(*CB))
+        continue;
+
+      // Find the unique free call to remove it.
+      SmallVector<CallBase *, 4> FreeCalls;
+      for (auto *U : CB->users()) {
+        CallBase *C = dyn_cast<CallBase>(U);
+        if (C && C->getCalledFunction() == FreeCall.Declaration)
+          FreeCalls.push_back(C);
+      }
+      if (FreeCalls.size() != 1)
+        continue;
+
+      ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
+
+      LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
+                        << CB->getCaller()->getName() << " with "
+                        << AllocSize->getZExtValue()
+                        << " bytes of shared memory\n");
+
+      // Create a new shared memory buffer of the same size as the allocation
+      // and replace all the uses of the original allocation with it.
+      Module *M = CB->getModule();
+      Type *Int8Ty = Type::getInt8Ty(M->getContext());
+      Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
+      auto *SharedMem = new GlobalVariable(
+          *M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
+          UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
+          GlobalValue::NotThreadLocal,
+          static_cast<unsigned>(AddressSpace::Shared));
+      auto *NewBuffer =
+          ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
+
+      SharedMem->setAlignment(MaybeAlign(32));
+
+      A.changeValueAfterManifest(*CB, *NewBuffer);
+      A.deleteAfterManifest(*CB);
+      A.deleteAfterManifest(*FreeCalls.front());
+
+      NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
+      Changed = ChangeStatus::CHANGED;
+    }
+
+    return Changed;
+  }
+
+  ChangeStatus updateImpl(Attributor &A) override {
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
+    Function *F = getAnchorScope();
+
+    auto NumMallocCalls = MallocCalls.size();
+
+    // Only consider malloc calls executed by a single thread with a constant
+    // size argument.
+    for (User *U : RFI.Declaration->users()) {
+      const auto &ED = A.getAAFor<AAExecutionDomain>(
+          *this, IRPosition::function(*F), DepClassTy::REQUIRED);
+      if (CallBase *CB = dyn_cast<CallBase>(U))
+        if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) ||
+            !ED.isExecutedByInitialThreadOnly(*CB))
+          MallocCalls.erase(CB);
+    }
+
+    if (NumMallocCalls != MallocCalls.size())
+      return ChangeStatus::CHANGED;
+
+    return ChangeStatus::UNCHANGED;
+  }
+
+  /// Collection of all malloc calls in a function.
+  SmallPtrSet<CallBase *, 4> MallocCalls;
+};
+
 } // namespace
 
 const char AAICVTracker::ID = 0;
 const char AAExecutionDomain::ID = 0;
+const char AAHeapToShared::ID = 0;
 
 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                               Attributor &A) {
@@ -2424,6 +2592,27 @@ AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
   return *AA;
 }
 
+AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
+                                                  Attributor &A) {
+  AAHeapToSharedFunction *AA = nullptr;
+  switch (IRP.getPositionKind()) {
+  case IRPosition::IRP_INVALID:
+  case IRPosition::IRP_FLOAT:
+  case IRPosition::IRP_ARGUMENT:
+  case IRPosition::IRP_CALL_SITE_ARGUMENT:
+  case IRPosition::IRP_RETURNED:
+  case IRPosition::IRP_CALL_SITE_RETURNED:
+  case IRPosition::IRP_CALL_SITE:
+    llvm_unreachable(
+        "AAHeapToShared can only be created for function position!");
+  case IRPosition::IRP_FUNCTION:
+    AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
+    break;
+  }
+
+  return *AA;
+}
+
 PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
   if (!containsOpenMP(M, OMPInModule))
     return PreservedAnalyses::all();

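The PatternMatch code added to AAExecutionDomainFunction::updateImpl above
recognizes the guard clang currently emits around teams regions: the guarded
block is entered only by the "master" thread, the first lane of the last warp
in the block. A scalar model of the matched computation (an illustrative
sketch, not code from this commit):

    // Matches tid == (ntid - 1) & ((warpsize - 1) ^ -1), i.e. the thread ID
    // of the first lane of the last warp in a one-dimensional thread block.
    static bool isMasterThread(unsigned ThreadId, unsigned BlockSize,
                               unsigned WarpSize) {
      unsigned MasterTid = (BlockSize - 1) & ~(WarpSize - 1);
      return ThreadId == MasterTid;
    }

The @qux function in the new test below exercises exactly this pattern.
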
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
new file mode 100644
index 0000000000000..5a513dd1046ca
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -0,0 +1,100 @@
+; RUN: opt -S -passes='openmp-opt' < %s | FileCheck %s
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64"
+
+; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
+; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
+
+; CHECK: %{{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
+; CHECK: call void @__kmpc_free_shared({{.*}})
+define dso_local void @foo() {
+entry:
+  %x = call i8* @__kmpc_alloc_shared(i64 4)
+  %x_on_stack = bitcast i8* %x to i32*
+  %0 = bitcast i32* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  ret void
+}
+
+define void @bar() {
+  call void @baz()
+  call void @qux()
+  ret void
+}
+
+; CHECK: %{{.*}} = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([16 x i8], [16 x i8] addrspace(3)* [[SHARED_X]], i32 0, i32 0) to i8*) to [4 x i32]*
+define internal void @baz() {
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %cmp = icmp eq i32 %tid, 0
+  br i1 %cmp, label %master, label %exit
+master:
+  %x = call i8* @__kmpc_alloc_shared(i64 16), !dbg !9
+  %x_on_stack = bitcast i8* %x to [4 x i32]*
+  %0 = bitcast [4 x i32]* %x_on_stack to i8*
+  call void @use(i8* %0)
+  call void @__kmpc_free_shared(i8* %x)
+  br label %exit
+exit:
+  ret void
+}
+
+; CHECK: %{{.*}} = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Y]], i32 0, i32 0) to i8*) to [4 x i32]*
+define internal void @qux() {
+entry:
+  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  %ntid = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  %warpsize = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  %0 = sub nuw i32 %warpsize, 1
+  %1 = sub nuw i32 %ntid, 1
+  %2 = xor i32 %0, -1
+  %master_tid = and i32 %1, %2
+  %3 = icmp eq i32 %tid, %master_tid
+  br i1 %3, label %master, label %exit
+master:
+  %y = call i8* @__kmpc_alloc_shared(i64 4), !dbg !10
+  %y_on_stack = bitcast i8* %y to [4 x i32]*
+  %4 = bitcast [4 x i32]* %y_on_stack to i8*
+  call void @use(i8* %4)
+  call void @__kmpc_free_shared(i8* %y)
+  br label %exit
+exit:
+  ret void
+}
+
+
+define void @use(i8* %x) {
+entry:
+  %addr = alloca i8*
+  store i8* %x, i8** %addr
+  ret void
+}
+
+declare i8* @__kmpc_alloc_shared(i64)
+
+declare void @__kmpc_free_shared(i8*)
+
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!nvvm.annotations = !{!5, !6}
+
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{void ()* @foo, !"kernel", i32 1}
+!6 = !{void ()* @bar, !"kernel", i32 1}
+!7 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 7, scope: !7)
+!10 = !DILocation(line: 5, column: 14, scope: !7)
