[llvm] 129faec - [OpenMP] Identify non-aligned barriers executed in an aligned context

Johannes Doerfert via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 22 21:42:27 PST 2023


Author: Johannes Doerfert
Date: 2023-01-22T21:42:07-08:00
New Revision: 129faec711693ac05b0dbf97c7e84e83dc0bbb88

URL: https://github.com/llvm/llvm-project/commit/129faec711693ac05b0dbf97c7e84e83dc0bbb88
DIFF: https://github.com/llvm/llvm-project/commit/129faec711693ac05b0dbf97c7e84e83dc0bbb88.diff

LOG: [OpenMP] Identify non-aligned barriers executed in an aligned context

Even if a barrier does not enforce aligned execution, it effectively
acts as an aligned barrier if it is executed by all threads in an
aligned way. We lack a control-flow divergence analysis here, so for
now we can only do (basic-block) local reasoning.
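
To illustrate the block-local reasoning, here is a minimal standalone
sketch (not the Attributor code itself; isKnownAlignedBarrier,
isNoSyncApprox, and collectEffectivelyAlignedBarriers are hypothetical
helpers): starting from an aligned entry point, e.g. a kernel entry
block, a non-aligned barrier such as amdgcn_s_barrier is treated as
effectively aligned as long as only no-sync instructions were seen
since the block entry or the last aligned barrier.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace llvm;

// Hypothetical stand-in for a barrier that always enforces aligned
// execution (cf. AANoSync::isAlignedBarrier).
static bool isKnownAlignedBarrier(const CallBase &CB) {
  return CB.getIntrinsicID() == Intrinsic::nvvm_barrier0;
}

// Hypothetical, conservative stand-in for AA::isNoSyncInst.
static bool isNoSyncApprox(const Instruction &I) {
  if (auto *CB = dyn_cast<CallBase>(&I))
    return CB->hasFnAttr(Attribute::NoSync);
  return !I.mayReadOrWriteMemory();
}

// Collect the calls in \p BB that act as aligned barriers, assuming the
// block itself is entered by all threads in an aligned way (e.g. a
// kernel entry block).
static SmallVector<CallBase *>
collectEffectivelyAlignedBarriers(BasicBlock &BB, bool EnteredAligned) {
  SmallVector<CallBase *> AlignedBarriers;
  bool AlignedSinceLastSync = EnteredAligned;
  for (Instruction &I : BB) {
    auto *CB = dyn_cast<CallBase>(&I);
    if (CB && isKnownAlignedBarrier(*CB)) {
      AlignedBarriers.push_back(CB);
      AlignedSinceLastSync = true;
      continue;
    }
    if (CB && CB->getIntrinsicID() == Intrinsic::amdgcn_s_barrier &&
        AlignedSinceLastSync) {
      // Not aligned by definition, but effectively aligned here.
      AlignedBarriers.push_back(CB);
      continue;
    }
    // Any potentially synchronizing instruction invalidates the
    // block-local fact.
    AlignedSinceLastSync &= isNoSyncApprox(I);
  }
  return AlignedBarriers;
}

The actual patch threads the same flag (AlignedBarrierLastInBlock)
through AAExecutionDomainFunction::updateImpl and passes it to
AANoSync::isAlignedBarrier as ExecutedAligned, as shown in the diff
below.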

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/IPO/Attributor.h
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/barrier_removal.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 7b747df5498b2..d27da1048a625 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -3332,9 +3332,12 @@ struct AANoSync
   /// Helper function specific for intrinsics which are potentially volatile.
   static bool isNoSyncIntrinsic(const Instruction *I);
 
-  /// Helper function to determine if \p CB is an aligned (GPU) barrier.
-  /// Aligned barriers have to be executed by all threads.
-  static bool isAlignedBarrier(const CallBase &CB);
+  /// Helper function to determine if \p CB is an aligned (GPU) barrier. Aligned
+  /// barriers have to be executed by all threads. The flag \p ExecutedAligned
+  /// indicates if the call is executed by all threads in a (thread) block in an
+  /// aligned way. If that is the case, non-aligned barriers are effectively
+  /// aligned barriers.
+  static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned);
 
   /// Create an abstract attribute view for the position \p IRP.
   static AANoSync &createForPosition(const IRPosition &IRP, Attributor &A);

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 6330740954ec2..cae410926fe1d 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -2226,14 +2226,15 @@ struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
 
 /// ------------------------ NoSync Function Attribute -------------------------
 
-bool AANoSync::isAlignedBarrier(const CallBase &CB) {
+bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
   switch (CB.getIntrinsicID()) {
   case Intrinsic::nvvm_barrier0:
   case Intrinsic::nvvm_barrier0_and:
   case Intrinsic::nvvm_barrier0_or:
   case Intrinsic::nvvm_barrier0_popc:
     return true;
-  // TODO: Check for amdgcn_s_barrier executed in a uniform/aligned way.
+  case Intrinsic::amdgcn_s_barrier:
+    return ExecutedAligned;
   default:
     break;
   }

diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index dee2ee2bb3ead..a3ed1685773e1 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2827,17 +2827,23 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
 
   Function *F = getAnchorScope();
   BasicBlock &EntryBB = F->getEntryBlock();
+  bool IsKernel = OMPInfoCache.Kernels.count(F);
 
   SmallVector<Instruction *> SyncInstWorklist;
   for (auto &RIt : *RPOT) {
     BasicBlock &BB = *RIt;
 
+    bool IsEntryBB = &BB == &EntryBB;
+    // TODO: We use local reasoning since we don't have a divergence analysis
+    // 	     running as well. We could basically allow uniform branches here.
+    bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
     ExecutionDomainTy ED;
     // Propagate "incoming edges" into information about this block.
-    if (&BB == &EntryBB) {
+    if (IsEntryBB) {
       handleEntryBB(A, ED);
     } else {
-      // For live non-entry blocks we only propagate information via live edges.
+      // For live non-entry blocks we only propagate
+      // information via live edges.
       if (LivenessAA.isAssumedDead(&BB))
         continue;
 
@@ -2874,7 +2880,10 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
       auto *CB = dyn_cast<CallBase>(&I);
       bool IsNoSync = AA::isNoSyncInst(A, I, *this);
       bool IsAlignedBarrier =
-          !IsNoSync && CB && AANoSync::isAlignedBarrier(*CB);
+          !IsNoSync && CB &&
+          AANoSync::isAlignedBarrier(*CB, AlignedBarrierLastInBlock);
+
+      AlignedBarrierLastInBlock &= IsNoSync;
 
       // Next we check for calls. Aligned barriers are handled
       // explicitly, everything else is kept for the backward traversal and will
@@ -2882,6 +2891,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
       if (CB) {
         if (IsAlignedBarrier) {
           HandleAlignedBarrier(CB, ED);
+          AlignedBarrierLastInBlock = true;
           continue;
         }
 
@@ -2913,6 +2923,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
             const auto &CalleeED = EDAA.getFunctionExecutionDomain();
             ED.IsReachedFromAlignedBarrierOnly =
                 CalleeED.IsReachedFromAlignedBarrierOnly;
+            AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
             if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
               ED.EncounteredNonLocalSideEffect |=
                   CalleeED.EncounteredNonLocalSideEffect;
@@ -2928,6 +2939,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
         }
         ED.IsReachedFromAlignedBarrierOnly =
             IsNoSync && ED.IsReachedFromAlignedBarrierOnly;
+        AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
         ED.EncounteredNonLocalSideEffect |= true;
         if (!IsNoSync)
           SyncInstWorklist.push_back(&I);
@@ -2971,7 +2983,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
       auto &FnED = BEDMap[nullptr];
       mergeInPredecessor(A, FnED, ED);
 
-      if (OMPInfoCache.Kernels.count(F))
+      if (IsKernel)
         HandleAlignedBarrier(nullptr, ED);
     }
 

diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index fc6bcf9391e7c..0e294abef9f01 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -70,11 +70,131 @@ define void @pos_empty_6() {
   call i32 @llvm.nvvm.barrier0.popc(i32 0)
   ret void
 }
-define void @neg_empty_7() {
-; CHECK-LABEL: define {{[^@]+}}@neg_empty_7() {
+define void @pos_empty_7a() {
+; CHECK-LABEL: define {{[^@]+}}@pos_empty_7a() {
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.amdgcn.s.barrier()
+  call void @unknown()
+  ret void
+}
+; FIXME: We should remove the barrier.
+define void @pos_empty_7b() {
+; CHECK-LABEL: define {{[^@]+}}@pos_empty_7b() {
+; CHECK-NEXT:    call void @unknown() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    call void @unknown()
+; CHECK-NEXT:    ret void
+;
+  call void @unknown() nosync readnone
+  call void @llvm.amdgcn.s.barrier()
+  call void @unknown()
+  ret void
+}
+define void @neg_empty_8() {
+; CHECK-LABEL: define {{[^@]+}}@neg_empty_8() {
+; CHECK-NEXT:    call void @unknown()
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    ret void
 ;
+  call void @unknown()
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+define void @neg_empty_9(i1 %c) {
+; CHECK-LABEL: define {{[^@]+}}@neg_empty_9
+; CHECK-SAME: (i1 [[C:%.*]]) {
+; CHECK-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f:
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    ret void
+;
+  br i1 %c, label %t, label %f
+t:
+  call void @llvm.amdgcn.s.barrier()
+  br label %m
+f:
+  call void @llvm.amdgcn.s.barrier()
+  br label %m
+m:
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+; FIXME: We should remove the barrier
+define void @pos_empty_10() {
+; CHECK-LABEL: define {{[^@]+}}@pos_empty_10() {
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       m:
+; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    ret void
+;
+  br label %m
+m:
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+define void @pos_empty_11() {
+; CHECK-LABEL: define {{[^@]+}}@pos_empty_11() {
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       m:
+; CHECK-NEXT:    ret void
+;
+  br label %m
+m:
+  call void @aligned_barrier()
+  call void @llvm.amdgcn.s.barrier()
+  ret void
+}
+define void @empty() {
+; CHECK-LABEL: define {{[^@]+}}@empty() {
+; CHECK-NEXT:    ret void
+;
+  ret void
+}
+; FIXME: We should remove the barrier in the end but not the first one.
+define void @neg_empty_12(i1 %c) {
+; MODULE-LABEL: define {{[^@]+}}@neg_empty_12
+; MODULE-SAME: (i1 [[C:%.*]]) {
+; MODULE-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
+; MODULE:       t:
+; MODULE-NEXT:    call void @llvm.amdgcn.s.barrier()
+; MODULE-NEXT:    br label [[M:%.*]]
+; MODULE:       f:
+; MODULE-NEXT:    br label [[M]]
+; MODULE:       m:
+; MODULE-NEXT:    call void @llvm.amdgcn.s.barrier()
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@neg_empty_12
+; CGSCC-SAME: (i1 [[C:%.*]]) {
+; CGSCC-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
+; CGSCC:       t:
+; CGSCC-NEXT:    call void @empty()
+; CGSCC-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CGSCC-NEXT:    br label [[M:%.*]]
+; CGSCC:       f:
+; CGSCC-NEXT:    call void @empty()
+; CGSCC-NEXT:    br label [[M]]
+; CGSCC:       m:
+; CGSCC-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CGSCC-NEXT:    ret void
+;
+  br i1 %c, label %t, label %f
+t:
+  call void @empty()
+  call void @llvm.amdgcn.s.barrier()
+  br label %m
+f:
+  call void @empty()
+  br label %m
+m:
   call void @llvm.amdgcn.s.barrier()
   ret void
 }
@@ -214,7 +334,6 @@ define void @neg_mem() {
 
 define void @pos_multiple() {
 ; CHECK-LABEL: define {{[^@]+}}@pos_multiple() {
-; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.nvvm.barrier0()
@@ -846,7 +965,7 @@ m3:
 }
 
 !llvm.module.flags = !{!16,!15}
-!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14}
+!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22}
 
 !0 = !{void ()* @pos_empty_1, !"kernel", i32 1}
 !1 = !{void ()* @pos_empty_2, !"kernel", i32 1}
@@ -854,7 +973,13 @@ m3:
 !3 = !{void ()* @pos_empty_4, !"kernel", i32 1}
 !4 = !{void ()* @pos_empty_5, !"kernel", i32 1}
 !5 = !{void ()* @pos_empty_6, !"kernel", i32 1}
-!6 = !{void ()* @neg_empty_7, !"kernel", i32 1}
+!17 = !{void ()* @pos_empty_7a, !"kernel", i32 1}
+!18 = !{void ()* @pos_empty_7b, !"kernel", i32 1}
+!6 = !{void ()* @neg_empty_8, !"kernel", i32 1}
+!19 = !{void (i1)* @neg_empty_9, !"kernel", i32 1}
+!20 = !{void ()* @pos_empty_10, !"kernel", i32 1}
+!21 = !{void ()* @pos_empty_11, !"kernel", i32 1}
+!22 = !{void (i1)* @neg_empty_12, !"kernel", i32 1}
 !7 = !{void ()* @pos_constant_loads, !"kernel", i32 1}
 !8 = !{void ()* @neg_loads, !"kernel", i32 1}
 !9 = !{void ()* @pos_priv_mem, !"kernel", i32 1}
@@ -870,6 +995,7 @@ m3:
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind }
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+; CHECK: attributes #[[ATTR4]] = { nosync memory(none) }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
@@ -879,7 +1005,7 @@ m3:
 ; CHECK: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1}
 ; CHECK: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1}
 ; CHECK: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1}
-; CHECK: [[META8:![0-9]+]] = !{ptr @neg_empty_7, !"kernel", i32 1}
+; CHECK: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1}
 ; CHECK: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1}
 ; CHECK: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1}
 ; CHECK: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1}
@@ -888,4 +1014,10 @@ m3:
 ; CHECK: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1}
 ; CHECK: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1}
 ; CHECK: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
+; CHECK: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1}
+; CHECK: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1}
+; CHECK: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1}
+; CHECK: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1}
+; CHECK: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1}
+; CHECK: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1}
 ;.
