[Openmp-commits] [openmp] cb17c48 - [Attributor] Identify and remove no-op fences

Johannes Doerfert via Openmp-commits openmp-commits at lists.llvm.org
Mon Jun 5 17:14:13 PDT 2023


Author: Johannes Doerfert
Date: 2023-06-05T17:14:00-07:00
New Revision: cb17c48fdd8c0f8ab6cd7ad3f197b052db20d1f6

URL: https://github.com/llvm/llvm-project/commit/cb17c48fdd8c0f8ab6cd7ad3f197b052db20d1f6
DIFF: https://github.com/llvm/llvm-project/commit/cb17c48fdd8c0f8ab6cd7ad3f197b052db20d1f6.diff

LOG: [Attributor] Identify and remove no-op fences

The logic and implementation follows the removal of no-op barriers. If
the fence is not making updates visible, either to the world or the
current thread, it is not needed. Said differently, the fences we remove
do not establish synchronization (happens-before) edges.
This allows us to eliminate some of the regression caused by:
  https://reviews.llvm.org/D145290

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/IPO/Attributor.h
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/OpenMP/barrier_removal.ll
    openmp/libomptarget/test/jit/empty_kernel_lvl2.c

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 4aec1f51b6323..90ed050dde9b7 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -119,6 +119,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Alignment.h"
@@ -5178,6 +5179,10 @@ struct AAExecutionDomain
   getExecutionDomain(const CallBase &CB) const = 0;
   virtual ExecutionDomainTy getFunctionExecutionDomain() const = 0;
 
+  /// Helper function to determine if \p FI is a no-op given the information
+  /// about its execution from \p ExecDomainAA.
+  virtual bool isNoOpFence(const FenceInst &FI) const = 0;
+
   /// This function should return true if the type of the \p AA is
   /// AAExecutionDomain.
   static bool classof(const AbstractAttribute *AA) {

diff  --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index b36cb2e88b227..2a7dcf883ea01 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -4339,13 +4339,22 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl {
 
     Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
     if (!isAssumedSideEffectFree(A, I)) {
-      if (!isa_and_nonnull<StoreInst>(I))
+      if (!isa_and_nonnull<StoreInst>(I) && !isa_and_nonnull<FenceInst>(I))
         indicatePessimisticFixpoint();
       else
         removeAssumedBits(HAS_NO_EFFECT);
     }
   }
 
+  bool isDeadFence(Attributor &A, FenceInst &FI) {
+    const auto *ExecDomainAA = A.lookupAAFor<AAExecutionDomain>(
+        IRPosition::function(*FI.getFunction()), *this, DepClassTy::NONE);
+    if (!ExecDomainAA || !ExecDomainAA->isNoOpFence(FI))
+      return false;
+    A.recordDependence(*ExecDomainAA, *this, DepClassTy::OPTIONAL);
+    return true;
+  }
+
   bool isDeadStore(Attributor &A, StoreInst &SI,
                    SmallSetVector<Instruction *, 8> *AssumeOnlyInst = nullptr) {
     // Lang ref now states volatile store is not UB/dead, let's skip them.
@@ -4399,6 +4408,9 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl {
     if (isa_and_nonnull<StoreInst>(I))
       if (isValidState())
         return "assumed-dead-store";
+    if (isa_and_nonnull<FenceInst>(I))
+      if (isValidState())
+        return "assumed-dead-fence";
     return AAIsDeadValueImpl::getAsStr();
   }
 
@@ -4408,6 +4420,9 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl {
     if (auto *SI = dyn_cast_or_null<StoreInst>(I)) {
       if (!isDeadStore(A, *SI))
         return indicatePessimisticFixpoint();
+    } else if (auto *FI = dyn_cast_or_null<FenceInst>(I)) {
+      if (!isDeadFence(A, *FI))
+        return indicatePessimisticFixpoint();
     } else {
       if (!isAssumedSideEffectFree(A, I))
         return indicatePessimisticFixpoint();
@@ -4443,6 +4458,11 @@ struct AAIsDeadFloating : public AAIsDeadValueImpl {
         }
         return ChangeStatus::CHANGED;
       }
+      if (auto *FI = dyn_cast<FenceInst>(I)) {
+        assert(isDeadFence(A, *FI));
+        A.deleteAfterManifest(*FI);
+        return ChangeStatus::CHANGED;
+      }
       if (isAssumedSideEffectFree(A, I) && !isa<InvokeInst>(I)) {
         A.deleteAfterManifest(*I);
         return ChangeStatus::CHANGED;

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 1e86167d570c0..d23efb8193827 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/EnumeratedArray.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
@@ -46,6 +47,7 @@
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/IPO/Attributor.h"
@@ -2641,6 +2643,10 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
     return Changed;
   }
 
+  bool isNoOpFence(const FenceInst &FI) const override {
+    return getState().isValidState() && !NonNoOpFences.count(&FI);
+  }
+
   /// Merge barrier and assumption information from \p PredED into the successor
   /// \p ED.
   void
@@ -2811,6 +2817,10 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
     R = V;
     return !Eq;
   }
+
+  /// Collection of fences known to be non-no-opt. All fences not in this set
+  /// can be assumed no-opt.
+  SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
 };
 
 void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
@@ -2980,6 +2990,33 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
           continue;
       }
 
+      if (auto *FI = dyn_cast<FenceInst>(&I)) {
+        if (!ED.EncounteredNonLocalSideEffect) {
+          // An aligned fence without non-local side-effects is a no-op.
+          if (ED.IsReachedFromAlignedBarrierOnly)
+            continue;
+          // A non-aligned fence without non-local side-effects is a no-op
+          // if the ordering only publishes non-local side-effects (or less).
+          switch (FI->getOrdering()) {
+          case AtomicOrdering::NotAtomic:
+            continue;
+          case AtomicOrdering::Unordered:
+            continue;
+          case AtomicOrdering::Monotonic:
+            continue;
+          case AtomicOrdering::Acquire:
+            break;
+          case AtomicOrdering::Release:
+            continue;
+          case AtomicOrdering::AcquireRelease:
+            break;
+          case AtomicOrdering::SequentiallyConsistent:
+            break;
+          };
+        }
+        NonNoOpFences.insert(FI);
+      }
+
       auto *CB = dyn_cast<CallBase>(&I);
       bool IsNoSync = AA::isNoSyncInst(A, I, *this);
       bool IsAlignedBarrier =
@@ -5206,6 +5243,10 @@ void OpenMPOpt::registerAAsForFunction(Attributor &A, const Function &F) {
       A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
       continue;
     }
+    if (auto *FI = dyn_cast<FenceInst>(&I)) {
+      A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*FI));
+      continue;
+    }
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       if (II->getIntrinsicID() == Intrinsic::assume) {
         A.getOrCreateAAFor<AAPotentialValues>(

diff  --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index 8ac82f0975d0f..9b092ba408bde 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -111,6 +111,7 @@ define void @pos_empty_8(i1 %c) "kernel" {
 ;
   br i1 %c, label %t, label %f
 t:
+  fence release
   call void @llvm.amdgcn.s.barrier() "llvm.assume"="ompx_aligned_barrier"
   br label %f
 f:
@@ -133,23 +134,33 @@ define void @neg_empty_9(i1 %c) "kernel" {
 ; CHECK-NEXT:    br i1 [[C]], label [[T:%.*]], label [[F:%.*]]
 ; CHECK:       t:
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    br label [[M:%.*]]
 ; CHECK:       f:
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    br label [[M]]
 ; CHECK:       m:
+; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    ret void
 ;
   br i1 %c, label %t, label %f
 t:
+  fence release
   call void @llvm.amdgcn.s.barrier()
+  fence release
   br label %m
 f:
+  fence release
   call void @llvm.amdgcn.s.barrier()
+  fence release
   br label %m
 m:
+  fence release
   call void @llvm.amdgcn.s.barrier()
+  fence release
   ret void
 }
 ; FIXME: We should remove the barrier
@@ -345,21 +356,27 @@ define void @neg_mem() "kernel" {
 ; CHECK-SAME: () #[[ATTR4]] {
 ; CHECK-NEXT:    [[ARG:%.*]] = load ptr, ptr @GPtr, align 8
 ; CHECK-NEXT:    [[A:%.*]] = load i32, ptr @G1, align 4
+; CHECK-NEXT:    fence seq_cst
 ; CHECK-NEXT:    call void @aligned_barrier()
 ; CHECK-NEXT:    store i32 [[A]], ptr [[ARG]], align 4
+; CHECK-NEXT:    fence release
 ; CHECK-NEXT:    call void @aligned_barrier()
 ; CHECK-NEXT:    [[B:%.*]] = load i32, ptr addrspacecast (ptr addrspace(1) @G2 to ptr), align 4
 ; CHECK-NEXT:    store i32 [[B]], ptr @G1, align 4
+; CHECK-NEXT:    fence acquire
 ; CHECK-NEXT:    ret void
 ;
   %arg = load ptr, ptr @GPtr
   %a = load i32, ptr @G1
+  fence seq_cst
   call void @aligned_barrier()
   store i32 %a, ptr %arg
+  fence release
   call void @aligned_barrier()
   %G2c = addrspacecast ptr addrspace(1) @G2 to ptr
   %b = load i32, ptr %G2c
   store i32 %b, ptr @G1
+  fence acquire
   call void @aligned_barrier()
   ret void
 }
@@ -397,27 +414,43 @@ define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" {
 ; CHECK:       m:
 ; CHECK-NEXT:    ret void
 ;
+  fence acquire
   call void @llvm.nvvm.barrier0()
+  fence release
   call void @aligned_barrier()
+  fence seq_cst
   br i1 %c0, label %t0, label %f0
 t0:
+  fence seq_cst
   call void @aligned_barrier()
+  fence seq_cst
   br label %t0b
 t0b:
+  fence seq_cst
   call void @aligned_barrier()
+  fence seq_cst
   br label %m
 f0:
+  fence release
   call void @aligned_barrier()
+  fence acquire
   call void @llvm.nvvm.barrier0()
+  fence acquire
   br i1 %c1, label %t1, label %f1
 t1:
+  fence acquire
   call void @aligned_barrier()
+  fence seq_cst
   br label %m
 f1:
+  fence seq_cst
   call void @aligned_barrier()
+  fence acquire
   br label %m
 m:
+  fence seq_cst
   call void @aligned_barrier()
+  fence seq_cst
   ret void
 }
 

diff  --git a/openmp/libomptarget/test/jit/empty_kernel_lvl2.c b/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
index 58181043e1bab..01270cf041de1 100644
--- a/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
+++ b/openmp/libomptarget/test/jit/empty_kernel_lvl2.c
@@ -5,8 +5,7 @@
 // RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
 // RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
 // RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
+// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefix=FIRST
 // RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
 // RUN:     -DTGT1_DIRECTIVE="target"                          \
 // RUN:     -DTGT2_DIRECTIVE="target"                          \
@@ -14,8 +13,7 @@
 // RUN: env LIBOMPTARGET_JIT_PRE_OPT_IR_MODULE=%t.pre.ll     \
 // RUN:     LIBOMPTARGET_JIT_SKIP_OPT=true                   \
 // RUN:     %libomptarget-run-generic
-// TODO:
-// RUN: not %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
+// RUN: %fcheck-plain-generic --input-file %t.pre.ll %S/empty_kernel.inc --check-prefixes=FIRST,SECOND
 //
 // RUN: %libomptarget-compileoptxx-generic -fopenmp-target-jit \
 // RUN:     -DTGT1_DIRECTIVE="target"                          \


        


More information about the Openmp-commits mailing list