[llvm] 5238df7 - [Attributor] Allow (inter-procedural) "CFG" reasoning for aligned regions

Mon Jan 23 22:48:02 PST 2023

Author: Johannes Doerfert
Date: 2023-01-23T22:45:48-08:00
New Revision: 5238df7ed594712713aff9880354e67b05ac16c4

URL: https://github.com/llvm/llvm-project/commit/5238df7ed594712713aff9880354e67b05ac16c4
DIFF: https://github.com/llvm/llvm-project/commit/5238df7ed594712713aff9880354e67b05ac16c4.diff

LOG: [Attributor] Allow (inter-procedural) "CFG" reasoning for aligned regions

If an instruction is executed in an aligned region we can ignore
threading effects and use CFG reasoning (dominance and reachability).
This is true because all threads are together in an aligned region and
there cannot be one waiting for a signal at a place not connected via
the control flow.

More dedicated tests will follow.

More details can be found here:
"Co-Designing an OpenMP GPU Runtime and Optimizations for Near-Zero
Overhead Execution", IPDPS 2022,
https://www.osti.gov/servlets/purl/1890094

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/IPO/Attributor.h
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index d27da1048a625..da171f8940749 100644

--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -5047,6 +5047,12 @@ struct AAExecutionDomain
   /// Check if a basic block is executed only by the initial thread.
   virtual bool isExecutedByInitialThreadOnly(const BasicBlock &) const = 0;
 
+  /// Check if the instruction \p I is executed in an aligned region, that is,
+  /// the synchronizing effects before and after \p I are both aligned barriers.
+  /// This effectively means all threads execute \p I together.
+  virtual bool isExecutedInAlignedRegion(Attributor &A,
+                                         const Instruction &I) const = 0;
+
   virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const = 0;
   virtual ExecutionDomainTy getExecutionDomain(const CallBase &) const = 0;
   virtual ExecutionDomainTy getFunctionExecutionDomain() const = 0;

diff  --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 0dd38b566776a..001ef55ba4722 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1045,6 +1045,10 @@ struct AAPointerInfoImpl
     const auto *ExecDomainAA = A.lookupAAFor<AAExecutionDomain>(
         IRPosition::function(Scope), &QueryingAA, DepClassTy::OPTIONAL);
     bool AllInSameNoSyncFn = NoSyncAA.isAssumedNoSync();
+    bool InstIsExecutedByInitialThreadOnly =
+        ExecDomainAA && ExecDomainAA->isExecutedByInitialThreadOnly(I);
+    bool InstIsExecutedInAlignedRegion =
+        ExecDomainAA && ExecDomainAA->isExecutedInAlignedRegion(A, I);
 
     InformationCache &InfoCache = A.getInfoCache();
     bool IsThreadLocalObj =
@@ -1054,10 +1058,18 @@ struct AAPointerInfoImpl
     // right now. However, if the function is (assumed) nosync or the thread
     // executing all instructions is the main thread only we can ignore
     // threading. Also, thread-local objects do not require threading reasoning.
-    auto CanIgnoreThreading = [&](const Instruction &I) -> bool {
-      if (IsThreadLocalObj)
+    // Finally, we can ignore threading if either access is executed in an
+    // aligned region.
+    auto CanIgnoreThreadingForInst = [&](const Instruction &I) -> bool {
+      if (IsThreadLocalObj || AllInSameNoSyncFn)
         return true;
-      if (ExecDomainAA && ExecDomainAA->isExecutedByInitialThreadOnly(I))
+      if (!ExecDomainAA)
+        return false;
+      if (InstIsExecutedInAlignedRegion ||
+          ExecDomainAA->isExecutedInAlignedRegion(A, I))
+        return true;
+      if (InstIsExecutedByInitialThreadOnly &&
+          ExecDomainAA->isExecutedByInitialThreadOnly(I))
         return true;
       return false;
     };
@@ -1065,8 +1077,10 @@ struct AAPointerInfoImpl
     // Helper to determine if the access is executed by the same thread as the
     // given instruction, for now it is sufficient to avoid any potential
     // threading effects as we cannot deal with them anyway.
-    auto IsSameThreadAsInst = [&](const Access &Acc) -> bool {
-      return AllInSameNoSyncFn || CanIgnoreThreading(*Acc.getLocalInst());
+    auto CanIgnoreThreading = [&](const Access &Acc) -> bool {
+      return CanIgnoreThreadingForInst(*Acc.getRemoteInst()) ||
+             (Acc.getRemoteInst() != Acc.getLocalInst() &&
+              CanIgnoreThreadingForInst(*Acc.getLocalInst()));
     };
 
     // TODO: Use inter-procedural reachability and dominance.
@@ -1168,7 +1182,7 @@ struct AAPointerInfoImpl
 
     // Helper to determine if we can skip a specific write access.
     auto CanSkipAccess = [&](const Access &Acc, bool Exact) {
-      if (!IsSameThreadAsInst(Acc))
+      if (!CanIgnoreThreading(Acc))
         return false;
 
       // Check read (RAW) dependences and write (WAR) dependences as necessary.
@@ -1236,7 +1250,7 @@ struct AAPointerInfoImpl
     // Run the user callback on all accesses we cannot skip and return if
     // that succeeded for all or not.
     for (auto &It : InterferingAccesses) {
-      if ((!AllInSameNoSyncFn && !IsThreadLocalObj) ||
+      if ((!AllInSameNoSyncFn && !IsThreadLocalObj && !ExecDomainAA) ||
           !CanSkipAccess(*It.first, It.second)) {
         if (!UserCB(*It.first, It.second))
           return false;

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 09178a2317367..bee154dab10fe 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2654,6 +2654,68 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
     return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
   }
 
+  bool isExecutedInAlignedRegion(Attributor &A,
+                                 const Instruction &I) const override {
+    if (!isValidState() || isa<CallBase>(I))
+      return false;
+
+    const Instruction *CurI;
+
+    // Check forward until a call or the block end is reached.
+    CurI = &I;
+    do {
+      auto *CB = dyn_cast<CallBase>(CurI);
+      if (!CB)
+        continue;
+      const auto &It = CEDMap.find(CB);
+      if (It == CEDMap.end())
+        continue;
+      if (!It->getSecond().IsReachedFromAlignedBarrierOnly)
+        return false;
+    } while ((CurI = CurI->getNextNonDebugInstruction()));
+
+    if (!CurI && !BEDMap.lookup(I.getParent()).IsReachedFromAlignedBarrierOnly)
+      return false;
+
+    // Check backward until a call or the block beginning is reached.
+    CurI = &I;
+    do {
+      auto *CB = dyn_cast<CallBase>(CurI);
+      if (!CB)
+        continue;
+      const auto &It = CEDMap.find(CB);
+      if (It == CEDMap.end())
+        continue;
+      if (!AA::isNoSyncInst(A, *CB, *this)) {
+        if (It->getSecond().IsReachedFromAlignedBarrierOnly)
+          break;
+        return false;
+      }
+
+      Function *Callee = CB->getCalledFunction();
+      if (!Callee || Callee->isDeclaration())
+        return false;
+      const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+          *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
+      if (!EDAA.getState().isValidState())
+        return false;
+      if (!EDAA.getFunctionExecutionDomain().IsReachedFromAlignedBarrierOnly)
+        return false;
+      break;
+    } while ((CurI = CurI->getPrevNonDebugInstruction()));
+
+    if (!CurI &&
+        !llvm::all_of(
+            predecessors(I.getParent()), [&](const BasicBlock *PredBB) {
+              return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
+            })) {
+      return false;
+    }
+
+    // On neither traversal did we find anything but aligned barriers.
+    return true;
+  }
+
   ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
     assert(isValidState() &&
            "No request should be made against an invalid state!");

diff  --git a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
index 008e8a2e565df..ca9905af23cd6 100644
--- a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
+++ b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
@@ -7,15 +7,18 @@ target triple = "amdgcn-amd-amdhsa"
 %struct.ident_t = type { i32, i32, i32, i32, ptr }
 
 @G = internal addrspace(3) global i32 undef, align 4
+@H = internal addrspace(3) global i32 undef, align 4
 @str = private unnamed_addr addrspace(4) constant [1 x i8] c"\00", align 1
 
 ; Make sure we do not delete the stores to @G without also replacing the load with `1`.
 ;.
 ; TUNIT: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
+; TUNIT: @[[H:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
 ; TUNIT: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
 ; TUNIT: @[[KERNEL_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
 ;.
 ; CGSCC: @[[G:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
+; CGSCC: @[[H:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global i32 undef, align 4
 ; CGSCC: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
 ;.
 define void @kernel() "kernel" {
@@ -33,8 +36,10 @@ define void @kernel() "kernel" {
 ; CHECK-NEXT:    call void @barrier() #[[ATTR5:[0-9]+]]
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(3) @G, align 4
 ; CHECK-NEXT:    call void @use1(i32 [[L]]) #[[ATTR5]]
+; CHECK-NEXT:    call void @barrier() #[[ATTR5]]
 ; CHECK-NEXT:    br label [[IF_MERGE]]
 ; CHECK:       if.merge:
+; CHECK-NEXT:    call void @use1(i32 2) #[[ATTR5]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]]
 ; CHECK:       if.then2:
 ; CHECK-NEXT:    store i32 2, ptr addrspace(3) @G, align 4
@@ -49,13 +54,20 @@ define void @kernel() "kernel" {
   br i1 %cmp, label %if.then, label %if.else
 if.then:
   store i32 1, ptr addrspace(3) @G
+  store i32 2, ptr addrspace(3) @H
   br label %if.merge
 if.else:
   call void @barrier();
   %l = load i32, ptr addrspace(3) @G
   call void @use1(i32 %l)
+  %hv = load i32, ptr addrspace(3) @H
+  %hc = icmp eq i32 %hv, 2
+  call void @llvm.assume(i1 %hc)
+  call void @barrier();
   br label %if.merge
 if.merge:
+  %hreload = load i32, ptr addrspace(3) @H
+  call void @use1(i32 %hreload)
   br i1 %cmp, label %if.then2, label %if.end
 if.then2:
   store i32 2, ptr addrspace(3) @G
@@ -75,7 +87,7 @@ define void @test_assume() {
   ret void
 }
 
-declare void @barrier() norecurse nounwind nocallback
+declare void @barrier() norecurse nounwind nocallback "llvm.assume"="ompx_aligned_barrier"
 declare void @use1(i32) nosync norecurse nounwind nocallback
 declare i32 @__kmpc_target_init(ptr, i8, i1) nocallback
 declare void @__kmpc_target_deinit(ptr, i8) nocallback
@@ -90,7 +102,7 @@ declare void @llvm.assume(i1)
 
 ;.
 ; CHECK: attributes #[[ATTR0]] = { norecurse "kernel" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind "llvm.assume"="ompx_aligned_barrier" }
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback }
 ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }