[llvm] 43c1c59 - [OpenMP] Merge barrier elimination into AAExecutionDomain

Johannes Doerfert via llvm-commits llvm-commits at lists.llvm.org
Sun Jan 22 16:35:37 PST 2023


Author: Johannes Doerfert
Date: 2023-01-22T16:34:59-08:00
New Revision: 43c1c59f7388da6cc0687095aa93497f3d1d510d

URL: https://github.com/llvm/llvm-project/commit/43c1c59f7388da6cc0687095aa93497f3d1d510d
DIFF: https://github.com/llvm/llvm-project/commit/43c1c59f7388da6cc0687095aa93497f3d1d510d.diff

LOG: [OpenMP] Merge barrier elimination into AAExecutionDomain

With this patch we track aligned barriers in AAExecutionDomain and also
delete unnecessary barriers there. This allows us to eliminate barriers
across blocks, across functions, and in the presence of complex accesses
that do not force a barrier. Further, we can use the collected
information to enable store-load forwarding in a threaded environment
(follow-up patch).

Differential Revision: https://reviews.llvm.org/D140463
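
To make the effect concrete, here is a minimal sketch (not part of the
patch) in the spirit of the updated barrier_removal.ll test; it assumes a
helper declaration carrying the "ompx_aligned_barrier" assumption:

  declare void @llvm.nvvm.barrier0()
  declare void @aligned_barrier() "llvm.assume"="ompx_aligned_barrier"

  define void @two_barriers(i1 %c) {
  entry:
    call void @llvm.nvvm.barrier0()
    %p = alloca i32
    store i32 0, ptr %p          ; thread-local, does not force a barrier
    br i1 %c, label %t, label %m
  t:
    br label %m
  m:
    call void @aligned_barrier() ; now removable, even across blocks
    ret void
  }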

Added: 
    

Modified: 
    llvm/include/llvm/Transforms/IPO/Attributor.h
    llvm/lib/Transforms/IPO/Attributor.cpp
    llvm/lib/Transforms/IPO/AttributorAttributes.cpp
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
    llvm/test/Transforms/OpenMP/barrier_removal.ll
    llvm/test/Transforms/OpenMP/deduplication_target.ll
    llvm/test/Transforms/OpenMP/heap-to-shared-missing-declarations.ll
    llvm/test/Transforms/OpenMP/internals_cgscc.ll
    llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
    llvm/test/Transforms/OpenMP/remove_globalization.ll
    llvm/test/Transforms/OpenMP/replace_globalization.ll
    llvm/test/Transforms/OpenMP/spmdization.ll
    llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
    llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
    llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 986004f63ef08..7b747df5498b2 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -110,6 +110,7 @@
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -376,6 +377,12 @@ bool isPotentiallyReachable(
 bool isAssumedThreadLocalObject(Attributor &A, Value &Obj,
                                 const AbstractAttribute &QueryingAA);
 
+/// Return true if \p I is potentially affected by a barrier.
+bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I,
+                                    const AbstractAttribute &QueryingAA);
+bool isPotentiallyAffectedByBarrier(Attributor &A, ArrayRef<const Value *> Ptrs,
+                                    const AbstractAttribute &QueryingAA,
+                                    const Instruction *CtxI);
 } // namespace AA
 
 template <>
@@ -1921,7 +1928,8 @@ struct Attributor {
   bool isAssumedDead(const Instruction &I, const AbstractAttribute *QueryingAA,
                      const AAIsDead *LivenessAA, bool &UsedAssumedInformation,
                      bool CheckBBLivenessOnly = false,
-                     DepClassTy DepClass = DepClassTy::OPTIONAL);
+                     DepClassTy DepClass = DepClassTy::OPTIONAL,
+                     bool CheckForDeadStore = false);
 
   /// Return true if \p U is assumed dead.
   ///
@@ -3324,6 +3332,10 @@ struct AANoSync
   /// Helper function specific for intrinsics which are potentially volatile.
   static bool isNoSyncIntrinsic(const Instruction *I);
 
+  /// Helper function to determine if \p CB is an aligned (GPU) barrier.
+  /// Aligned barriers have to be executed by all threads.
+  static bool isAlignedBarrier(const CallBase &CB);
+
   /// Create an abstract attribute view for the position \p IRP.
   static AANoSync &createForPosition(const IRPosition &IRP, Attributor &A);
 
@@ -3618,9 +3630,6 @@ struct AAIsDead
   /// Returns true if the underlying value is known dead.
   virtual bool isKnownDead() const = 0;
 
-  /// Returns true if \p BB is assumed dead.
-  virtual bool isAssumedDead(const BasicBlock *BB) const = 0;
-
   /// Returns true if \p BB is known dead.
   virtual bool isKnownDead(const BasicBlock *BB) const = 0;
 
@@ -3659,6 +3668,9 @@ struct AAIsDead
     return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F);
   }
 
+  /// Returns true if \p BB is assumed dead.
+  virtual bool isAssumedDead(const BasicBlock *BB) const = 0;
+
   /// Return if the edge from \p From BB to \p To BB is assumed dead.
   /// This is specifically useful in AAReachability.
   virtual bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const {
@@ -4988,6 +5000,32 @@ struct AAExecutionDomain
   using Base = StateWrapper<BooleanState, AbstractAttribute>;
   AAExecutionDomain(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
 
+  /// Summary about the execution domain of a block or instruction.
+  struct ExecutionDomainTy {
+    using BarriersSetTy = SmallPtrSet<CallBase *, 2>;
+    using AssumesSetTy = SmallPtrSet<AssumeInst *, 4>;
+
+    void addAssumeInst(Attributor &A, AssumeInst &AI) {
+      EncounteredAssumes.insert(&AI);
+    }
+
+    void addAlignedBarrier(Attributor &A, CallBase &CB) {
+      AlignedBarriers.insert(&CB);
+    }
+
+    void clearAssumeInstAndAlignedBarriers() {
+      EncounteredAssumes.clear();
+      AlignedBarriers.clear();
+    }
+
+    bool IsExecutedByInitialThreadOnly = true;
+    bool IsReachedFromAlignedBarrierOnly = true;
+    bool IsReachingAlignedBarrierOnly = true;
+    bool EncounteredNonLocalSideEffect = false;
+    BarriersSetTy AlignedBarriers;
+    AssumesSetTy EncounteredAssumes;
+  };
+
   /// Create an abstract attribute view for the position \p IRP.
   static AAExecutionDomain &createForPosition(const IRPosition &IRP,
                                               Attributor &A);
@@ -4999,11 +5037,17 @@ struct AAExecutionDomain
   const char *getIdAddr() const override { return &ID; }
 
   /// Check if an instruction is executed only by the initial thread.
-  virtual bool isExecutedByInitialThreadOnly(const Instruction &) const = 0;
+  bool isExecutedByInitialThreadOnly(const Instruction &I) const {
+    return isExecutedByInitialThreadOnly(*I.getParent());
+  }
 
   /// Check if a basic block is executed only by the initial thread.
   virtual bool isExecutedByInitialThreadOnly(const BasicBlock &) const = 0;
 
+  virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const = 0;
+  virtual ExecutionDomainTy getExecutionDomain(const CallBase &) const = 0;
+  virtual ExecutionDomainTy getFunctionExecutionDomain() const = 0;
+
   /// This function should return true if the type of the \p AA is
   /// AAExecutionDomain.
   static bool classof(const AbstractAttribute *AA) {
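
As a sketch of how the new ExecutionDomainTy flags compose (reusing the
hypothetical @aligned_barrier declaration from above): a join block keeps
IsReachedFromAlignedBarrierOnly only if the flag holds along every live
incoming edge, so the barrier in %m below must stay.

  declare void @unknown()
  declare void @aligned_barrier() "llvm.assume"="ompx_aligned_barrier"

  define void @join(i1 %c) {
    br i1 %c, label %t, label %f
  t:
    call void @aligned_barrier()
    br label %m
  f:
    call void @unknown()           ; may synchronize
    br label %m
  m:
    call void @aligned_barrier()   ; kept: %f is not aligned-barrier-only
    ret void
  }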

diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 92b711658baf9..08ded81eb77f6 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -784,6 +784,61 @@ bool AA::isAssumedThreadLocalObject(Attributor &A, Value &Obj,
   return false;
 }
 
+bool AA::isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I,
+                                        const AbstractAttribute &QueryingAA) {
+  if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
+    return false;
+
+  SmallSetVector<const Value *, 8> Ptrs;
+
+  auto AddLocationPtr = [&](std::optional<MemoryLocation> Loc) {
+    if (!Loc || !Loc->Ptr) {
+      LLVM_DEBUG(
+          dbgs() << "[AA] Access to unknown location; -> requires barriers\n");
+      return false;
+    }
+    Ptrs.insert(Loc->Ptr);
+    return true;
+  };
+
+  if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&I)) {
+    if (!AddLocationPtr(MemoryLocation::getForDest(MI)))
+      return true;
+    if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(&I))
+      if (!AddLocationPtr(MemoryLocation::getForSource(MTI)))
+        return true;
+  } else if (!AddLocationPtr(MemoryLocation::getOrNone(&I)))
+    return true;
+
+  return isPotentiallyAffectedByBarrier(A, Ptrs.getArrayRef(), QueryingAA, &I);
+}
+
+bool AA::isPotentiallyAffectedByBarrier(Attributor &A,
+                                        ArrayRef<const Value *> Ptrs,
+                                        const AbstractAttribute &QueryingAA,
+                                        const Instruction *CtxI) {
+  for (const Value *Ptr : Ptrs) {
+    if (!Ptr) {
+      LLVM_DEBUG(dbgs() << "[AA] nullptr; -> requires barriers\n");
+      return true;
+    }
+
+    auto Pred = [&](Value &Obj) {
+      if (AA::isAssumedThreadLocalObject(A, Obj, QueryingAA))
+        return true;
+      LLVM_DEBUG(dbgs() << "[AA] Access to '" << Obj << "' via '" << *Ptr
+                        << "'; -> requires barrier\n");
+      return false;
+    };
+
+    const auto &UnderlyingObjsAA = A.getAAFor<AAUnderlyingObjects>(
+        QueryingAA, IRPosition::value(*Ptr), DepClassTy::OPTIONAL);
+    if (!UnderlyingObjsAA.forallUnderlyingObjects(Pred))
+      return true;
+  }
+  return false;
+}
+
 /// Return true if \p New is equal or worse than \p Old.
 static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
   if (!Old.isIntAttribute())
@@ -1349,7 +1404,8 @@ bool Attributor::isAssumedDead(const Instruction &I,
                                const AbstractAttribute *QueryingAA,
                                const AAIsDead *FnLivenessAA,
                                bool &UsedAssumedInformation,
-                               bool CheckBBLivenessOnly, DepClassTy DepClass) {
+                               bool CheckBBLivenessOnly, DepClassTy DepClass,
+                               bool CheckForDeadStore) {
   const IRPosition::CallBaseContext *CBCtx =
       QueryingAA ? QueryingAA->getCallBaseContext() : nullptr;
 
@@ -1394,6 +1450,14 @@ bool Attributor::isAssumedDead(const Instruction &I,
     return true;
   }
 
+  if (CheckForDeadStore && isa<StoreInst>(I) && IsDeadAA.isRemovableStore()) {
+    if (QueryingAA)
+      recordDependence(IsDeadAA, *QueryingAA, DepClass);
+    if (!IsDeadAA.isKnownDead())
+      UsedAssumedInformation = true;
+    return true;
+  }
+
   return false;
 }
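
A sketch of the distinction AA::isPotentiallyAffectedByBarrier draws
(again assuming the hypothetical @aligned_barrier declaration): accesses
whose underlying object is assumed thread-local do not keep a barrier
alive, while accesses through an unknown pointer do.

  declare void @aligned_barrier() "llvm.assume"="ompx_aligned_barrier"

  define void @local_vs_shared(ptr %p) {
    call void @aligned_barrier()
    %loc = alloca i32
    store i32 0, ptr %loc        ; thread-local object: no barrier required
    call void @aligned_barrier() ; removable: no non-local effect in between
    store i32 1, ptr %p          ; potentially shared with other threads
    call void @aligned_barrier() ; kept: it guards the store to %p
    ret void
  }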
 

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index f9a105823f72c..6330740954ec2 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -50,6 +50,8 @@
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
@@ -2224,6 +2226,20 @@ struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
 
 /// ------------------------ NoSync Function Attribute -------------------------
 
+bool AANoSync::isAlignedBarrier(const CallBase &CB) {
+  switch (CB.getIntrinsicID()) {
+  case Intrinsic::nvvm_barrier0:
+  case Intrinsic::nvvm_barrier0_and:
+  case Intrinsic::nvvm_barrier0_or:
+  case Intrinsic::nvvm_barrier0_popc:
+    return true;
+  // TODO: Check for amdgcn_s_barrier executed in a uniform/aligned way.
+  default:
+    break;
+  }
+  return hasAssumption(CB, KnownAssumptionString("ompx_aligned_barrier"));
+}
+
 bool AANoSync::isNonRelaxedAtomic(const Instruction *I) {
   if (!I->isAtomic())
     return false;
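
A sketch of the two ways AANoSync::isAlignedBarrier can fire, assuming
the call-site attribute spelling below is what hasAssumption inspects:

  declare void @llvm.nvvm.barrier0()
  declare void @some_barrier()

  define void @both_forms() {
    call void @llvm.nvvm.barrier0()                                ; intrinsic
    call void @some_barrier() "llvm.assume"="ompx_aligned_barrier" ; assumption
    ret void
  }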

diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 67014bde7df72..95f7ce01441e1 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/EnumeratedArray.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/CallGraph.h"
@@ -32,6 +33,7 @@
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/Assumptions.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/GlobalValue.h"
@@ -51,6 +53,7 @@
 
 #include <algorithm>
 #include <optional>
+#include <string>
 
 using namespace llvm;
 using namespace omp;
@@ -827,8 +830,6 @@ struct OpenMPOpt {
 
       if (remarksEnabled())
         analysisGlobalization();
-
-      Changed |= eliminateBarriers();
     } else {
       if (PrintICVValues)
         printICVs();
@@ -851,8 +852,6 @@ struct OpenMPOpt {
           Changed = true;
         }
       }
-
-      Changed |= eliminateBarriers();
     }
 
     return Changed;
@@ -1418,223 +1417,6 @@ struct OpenMPOpt {
     return Changed;
   }
 
-  /// Eliminates redundant, aligned barriers in OpenMP offloaded kernels.
-  /// TODO: Make this an AA and expand it to work across blocks and functions.
-  bool eliminateBarriers() {
-    bool Changed = false;
-
-    if (DisableOpenMPOptBarrierElimination)
-      return /*Changed=*/false;
-
-    if (OMPInfoCache.Kernels.empty())
-      return /*Changed=*/false;
-
-    enum ImplicitBarrierType { IBT_ENTRY, IBT_EXIT };
-
-    class BarrierInfo {
-      Instruction *I;
-      enum ImplicitBarrierType Type;
-
-    public:
-      BarrierInfo(enum ImplicitBarrierType Type) : I(nullptr), Type(Type) {}
-      BarrierInfo(Instruction &I) : I(&I) {}
-
-      bool isImplicit() { return !I; }
-
-      bool isImplicitEntry() { return isImplicit() && Type == IBT_ENTRY; }
-
-      bool isImplicitExit() { return isImplicit() && Type == IBT_EXIT; }
-
-      Instruction *getInstruction() { return I; }
-    };
-
-    for (Function *Kernel : OMPInfoCache.Kernels) {
-      for (BasicBlock &BB : *Kernel) {
-        SmallVector<BarrierInfo, 8> BarriersInBlock;
-        SmallPtrSet<Instruction *, 8> BarriersToBeDeleted;
-
-        // Add the kernel entry implicit barrier.
-        if (&Kernel->getEntryBlock() == &BB)
-          BarriersInBlock.push_back(IBT_ENTRY);
-
-        // Find implicit and explicit aligned barriers in the same basic block.
-        for (Instruction &I : BB) {
-          if (isa<ReturnInst>(I)) {
-            // Add the implicit barrier when exiting the kernel.
-            BarriersInBlock.push_back(IBT_EXIT);
-            continue;
-          }
-          CallBase *CB = dyn_cast<CallBase>(&I);
-          if (!CB)
-            continue;
-
-          auto IsAlignBarrierCB = [&](CallBase &CB) {
-            switch (CB.getIntrinsicID()) {
-            case Intrinsic::nvvm_barrier0:
-            case Intrinsic::nvvm_barrier0_and:
-            case Intrinsic::nvvm_barrier0_or:
-            case Intrinsic::nvvm_barrier0_popc:
-              return true;
-            default:
-              break;
-            }
-            return hasAssumption(CB,
-                                 KnownAssumptionString("ompx_aligned_barrier"));
-          };
-
-          if (IsAlignBarrierCB(*CB)) {
-            // Add an explicit aligned barrier.
-            BarriersInBlock.push_back(I);
-          }
-        }
-
-        if (BarriersInBlock.size() <= 1)
-          continue;
-
-        // A barrier in a barrier pair is removeable if all instructions
-        // between the barriers in the pair are side-effect free modulo the
-        // barrier operation.
-        auto IsBarrierRemoveable = [&Kernel](
-                                       BarrierInfo *StartBI, BarrierInfo *EndBI,
-                                       SmallVector<AssumeInst *> &Assumptions) {
-          assert(
-              !StartBI->isImplicitExit() &&
-              "Expected start barrier to be other than a kernel exit barrier");
-          assert(
-              !EndBI->isImplicitEntry() &&
-              "Expected end barrier to be other than a kernel entry barrier");
-          // If StarBI instructions is null then this the implicit
-          // kernel entry barrier, so iterate from the first instruction in the
-          // entry block.
-          Instruction *I = (StartBI->isImplicitEntry())
-                               ? &Kernel->getEntryBlock().front()
-                               : StartBI->getInstruction()->getNextNode();
-          assert(I && "Expected non-null start instruction");
-          Instruction *E = (EndBI->isImplicitExit())
-                               ? I->getParent()->getTerminator()
-                               : EndBI->getInstruction();
-          assert(E && "Expected non-null end instruction");
-
-          for (; I != E; I = I->getNextNode()) {
-            if (!I->mayHaveSideEffects() && !I->mayReadFromMemory())
-              continue;
-
-            auto IsPotentiallyAffectedByBarrier =
-                [](std::optional<MemoryLocation> Loc) {
-                  const Value *Obj = (Loc && Loc->Ptr)
-                                         ? getUnderlyingObject(Loc->Ptr)
-                                         : nullptr;
-                  if (!Obj) {
-                    LLVM_DEBUG(
-                        dbgs()
-                        << "Access to unknown location requires barriers\n");
-                    return true;
-                  }
-                  if (isa<UndefValue>(Obj))
-                    return false;
-                  if (isa<AllocaInst>(Obj))
-                    return false;
-                  if (auto *GV = dyn_cast<GlobalVariable>(Obj)) {
-                    if (GV->isConstant())
-                      return false;
-                    if (GV->isThreadLocal())
-                      return false;
-                    if (GV->getAddressSpace() == (int)AddressSpace::Local)
-                      return false;
-                    if (GV->getAddressSpace() == (int)AddressSpace::Constant)
-                      return false;
-                  }
-                  LLVM_DEBUG(dbgs() << "Access to '" << *Obj
-                                    << "' requires barriers\n");
-                  return true;
-                };
-
-            if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
-              std::optional<MemoryLocation> Loc =
-                  MemoryLocation::getForDest(MI);
-              if (IsPotentiallyAffectedByBarrier(Loc))
-                return false;
-              if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
-                std::optional<MemoryLocation> Loc =
-                    MemoryLocation::getForSource(MTI);
-                if (IsPotentiallyAffectedByBarrier(Loc))
-                  return false;
-              }
-              continue;
-            }
-
-            if (auto *AI = dyn_cast<AssumeInst>(I)) {
-              Assumptions.push_back(AI);
-              continue;
-            }
-
-            if (auto *LI = dyn_cast<LoadInst>(I))
-              if (LI->hasMetadata(LLVMContext::MD_invariant_load))
-                continue;
-
-            std::optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I);
-            if (IsPotentiallyAffectedByBarrier(Loc))
-              return false;
-          }
-
-          return true;
-        };
-
-        // Iterate barrier pairs and remove an explicit barrier if analysis
-        // deems it removeable.
-        for (auto *It = BarriersInBlock.begin(),
-                  *End = BarriersInBlock.end() - 1;
-             It != End; ++It) {
-
-          BarrierInfo *StartBI = It;
-          BarrierInfo *EndBI = (It + 1);
-
-          // Cannot remove when both are implicit barriers, continue.
-          if (StartBI->isImplicit() && EndBI->isImplicit())
-            continue;
-
-          SmallVector<AssumeInst *> Assumptions;
-          if (!IsBarrierRemoveable(StartBI, EndBI, Assumptions))
-            continue;
-
-          assert(!(StartBI->isImplicit() && EndBI->isImplicit()) &&
-                 "Expected at least one explicit barrier to remove.");
-
-          for (auto *Assumption : Assumptions)
-            Assumption->eraseFromParent();
-
-          // Remove an explicit barrier, check first, then second.
-          if (!StartBI->isImplicit()) {
-            LLVM_DEBUG(dbgs() << "Remove start barrier "
-                              << *StartBI->getInstruction() << "\n");
-            BarriersToBeDeleted.insert(StartBI->getInstruction());
-          } else {
-            LLVM_DEBUG(dbgs() << "Remove end barrier "
-                              << *EndBI->getInstruction() << "\n");
-            BarriersToBeDeleted.insert(EndBI->getInstruction());
-          }
-        }
-
-        if (BarriersToBeDeleted.empty())
-          continue;
-
-        Changed = true;
-        for (Instruction *I : BarriersToBeDeleted) {
-          ++NumBarriersEliminated;
-          auto Remark = [&](OptimizationRemark OR) {
-            return OR << "Redundant barrier eliminated.";
-          };
-
-          if (EnableVerboseRemarks)
-            emitRemark<OptimizationRemark>(I, "OMP190", Remark);
-          I->eraseFromParent();
-        }
-      }
-    }
-
-    return Changed;
-  }
-
   void analysisGlobalization() {
     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
 
@@ -2748,77 +2530,154 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
   AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
       : AAExecutionDomain(IRP, A) {}
 
+  ~AAExecutionDomainFunction() {
+    delete RPOT;
+  }
+
+  void initialize(Attributor &A) override {
+    if (getAnchorScope()->isDeclaration()) {
+      indicatePessimisticFixpoint();
+      return;
+    }
+    RPOT = new ReversePostOrderTraversal<Function *>(getAnchorScope());
+  }
+
   const std::string getAsStr() const override {
-    return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
-           "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
+    unsigned TotalBlocks = 0, InitialThreadBlocks = 0;
+    for (auto &It : BEDMap) {
+      TotalBlocks++;
+      InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
+    }
+    return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
+           std::to_string(TotalBlocks) + " executed by initial thread only";
   }
 
   /// See AbstractAttribute::trackStatistics().
   void trackStatistics() const override {}
 
-  void initialize(Attributor &A) override {
-    Function *F = getAnchorScope();
-    for (const auto &BB : *F)
-      SingleThreadedBBs.insert(&BB);
-    NumBBs = SingleThreadedBBs.size();
-  }
-
   ChangeStatus manifest(Attributor &A) override {
     LLVM_DEBUG({
-      for (const BasicBlock *BB : SingleThreadedBBs)
+      for (const BasicBlock &BB : *getAnchorScope()) {
+        if (!isExecutedByInitialThreadOnly(BB))
+          continue;
         dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
-               << BB->getName() << " is executed by a single thread.\n";
+               << BB.getName() << " is executed by a single thread.\n";
+      }
     });
-    return ChangeStatus::UNCHANGED;
-  }
 
-  ChangeStatus updateImpl(Attributor &A) override;
+    ChangeStatus Changed = ChangeStatus::UNCHANGED;
 
-  /// Check if an instruction is executed by a single thread.
-  bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
-    return isExecutedByInitialThreadOnly(*I.getParent());
-  }
+    if (DisableOpenMPOptBarrierElimination)
+      return Changed;
 
-  bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
-    return isValidState() && SingleThreadedBBs.contains(&BB);
+    SmallPtrSet<CallBase *, 16> DeletedBarriers;
+    auto HandleAlignedBarrier = [&](CallBase *CB) {
+      const ExecutionDomainTy &ED = CEDMap[CB];
+      if (!ED.IsReachedFromAlignedBarrierOnly ||
+          ED.EncounteredNonLocalSideEffect)
+        return;
+
+      // We can remove this barrier, if it is one, or all aligned barriers
+      // reaching the kernel end. In the latter case we can transitively work
+      // our way back until we find a barrier that guards a side-effect if we
+      // are dealing with the kernel end here.
+      if (CB) {
+        DeletedBarriers.insert(CB);
+        A.deleteAfterManifest(*CB);
+        ++NumBarriersEliminated;
+        Changed = ChangeStatus::CHANGED;
+      } else if (!ED.AlignedBarriers.empty()) {
+        NumBarriersEliminated += ED.AlignedBarriers.size();
+        Changed = ChangeStatus::CHANGED;
+        SmallVector<CallBase *> Worklist(ED.AlignedBarriers.begin(),
+                                         ED.AlignedBarriers.end());
+        SmallSetVector<CallBase *, 16> Visited;
+        while (!Worklist.empty()) {
+          CallBase *LastCB = Worklist.pop_back_val();
+          if (!Visited.insert(LastCB))
+            continue;
+          if (!DeletedBarriers.count(LastCB)) {
+            A.deleteAfterManifest(*LastCB);
+            continue;
+          }
+          // The final aligned barrier (LastCB) reaching the kernel end was
+          // removed already. This means we can go one step further and remove
+          // the barriers encountered last before LastCB.
+          const ExecutionDomainTy &LastED = CEDMap[LastCB];
+          Worklist.append(LastED.AlignedBarriers.begin(),
+                          LastED.AlignedBarriers.end());
+        }
+      }
+
+      // If we actually eliminated a barrier we need to eliminate the associated
+      // llvm.assumes as well to avoid creating UB.
+      if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
+        for (auto *AssumeCB : ED.EncounteredAssumes)
+          A.deleteAfterManifest(*AssumeCB);
+    };
+
+    for (auto *CB : AlignedBarriers)
+      HandleAlignedBarrier(CB);
+
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    // Handle the "kernel end barrier" for kernels too.
+    if (OMPInfoCache.Kernels.count(getAnchorScope()))
+      HandleAlignedBarrier(nullptr);
+
+    return Changed;
   }
 
-  /// Set of basic blocks that are executed by a single thread.
-  SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs;
+  /// Merge barrier and assumption information from \p PredED into the successor
+  /// \p ED.
+  void
+  mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
+                                           const ExecutionDomainTy &PredED);
 
-  /// Total number of basic blocks in this function.
-  long unsigned NumBBs = 0;
-};
+  /// Merge all information from \p PredED into the successor \p ED. If
+  /// \p InitialEdgeOnly is set, only the initial edge will enter the block
+  /// represented by \p ED from this predecessor.
+  void mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
+                          const ExecutionDomainTy &PredED,
+                          bool InitialEdgeOnly = false);
 
-ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
-  Function *F = getAnchorScope();
-  ReversePostOrderTraversal<Function *> RPOT(F);
-  auto NumSingleThreadedBBs = SingleThreadedBBs.size();
+  /// Accumulate information for the entry block in \p EntryBBED.
+  void handleEntryBB(Attributor &A, ExecutionDomainTy &EntryBBED);
 
-  bool AllCallSitesKnown;
-  auto PredForCallSite = [&](AbstractCallSite ACS) {
-    const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
-        *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
-        DepClassTy::REQUIRED);
-    return ACS.isDirectCall() &&
-           ExecutionDomainAA.isExecutedByInitialThreadOnly(
-               *ACS.getInstruction());
-  };
+  /// See AbstractAttribute::updateImpl.
+  ChangeStatus updateImpl(Attributor &A) override;
 
-  if (!A.checkForAllCallSites(PredForCallSite, *this,
-                              /* RequiresAllCallSites */ true,
-                              AllCallSitesKnown))
-    SingleThreadedBBs.remove(&F->getEntryBlock());
+  /// Query interface, see AAExecutionDomain
+  ///{
+  bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
+    if (!isValidState())
+      return false;
+    return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
+  }
 
-  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
-  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
+  ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
+    assert(isValidState() &&
+           "No request should be made against an invalid state!");
+    return BEDMap.lookup(&BB);
+  }
+  ExecutionDomainTy getExecutionDomain(const CallBase &CB) const override {
+    assert(isValidState() &&
+           "No request should be made against an invalid state!");
+    return CEDMap.lookup(&CB);
+  }
+  ExecutionDomainTy getFunctionExecutionDomain() const override {
+    assert(isValidState() &&
+           "No request should be made against an invalid state!");
+    return BEDMap.lookup(nullptr);
+  }
+  ///}
 
   // Check if the edge into the successor block contains a condition that only
   // lets the main thread execute it.
-  auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
+  static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
+                                      BasicBlock &SuccessorBB) {
     if (!Edge || !Edge->isConditional())
       return false;
-    if (Edge->getSuccessor(0) != SuccessorBB)
+    if (Edge->getSuccessor(0) != &SuccessorBB)
       return false;
 
     auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
@@ -2832,6 +2691,8 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
     // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
     if (C->isAllOnesValue()) {
       auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
+      auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+      auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
       CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
       if (!CB)
         return false;
@@ -2855,30 +2716,322 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
     return false;
   };
 
-  // Merge all the predecessor states into the current basic block. A basic
-  // block is executed by a single thread if all of its predecessors are.
-  auto MergePredecessorStates = [&](BasicBlock *BB) {
-    if (pred_empty(BB))
-      return SingleThreadedBBs.contains(BB);
-
-    bool IsInitialThread = true;
-    for (BasicBlock *PredBB : predecessors(BB)) {
-      if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()),
-                               BB))
-        IsInitialThread &= SingleThreadedBBs.contains(PredBB);
+  /// Mapping containing information per block.
+  DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
+  DenseMap<const CallBase *, ExecutionDomainTy> CEDMap;
+  SmallSetVector<CallBase *, 16> AlignedBarriers;
+
+  ReversePostOrderTraversal<Function *> *RPOT = nullptr;
+};
+
+void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
+    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
+  for (auto *EA : PredED.EncounteredAssumes)
+    ED.addAssumeInst(A, *EA);
+
+  for (auto *AB : PredED.AlignedBarriers)
+    ED.addAlignedBarrier(A, *AB);
+}
+
+void AAExecutionDomainFunction::mergeInPredecessor(
+    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
+    bool InitialEdgeOnly) {
+  ED.IsExecutedByInitialThreadOnly =
+      InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
+                          ED.IsExecutedByInitialThreadOnly);
+
+  ED.IsReachedFromAlignedBarrierOnly = ED.IsReachedFromAlignedBarrierOnly &&
+                                       PredED.IsReachedFromAlignedBarrierOnly;
+  ED.EncounteredNonLocalSideEffect =
+      ED.EncounteredNonLocalSideEffect | PredED.EncounteredNonLocalSideEffect;
+  if (ED.IsReachedFromAlignedBarrierOnly)
+    mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
+  else
+    ED.clearAssumeInstAndAlignedBarriers();
+}
+
+void AAExecutionDomainFunction::handleEntryBB(Attributor &A,
+                                              ExecutionDomainTy &EntryBBED) {
+  SmallVector<ExecutionDomainTy> PredExecDomains;
+  auto PredForCallSite = [&](AbstractCallSite ACS) {
+    const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+        *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
+        DepClassTy::OPTIONAL);
+    if (!EDAA.getState().isValidState())
+      return false;
+    PredExecDomains.emplace_back(
+        EDAA.getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
+    return true;
+  };
+
+  bool AllCallSitesKnown;
+  if (A.checkForAllCallSites(PredForCallSite, *this,
+                             /* RequiresAllCallSites */ true,
+                             AllCallSitesKnown)) {
+    for (const auto &PredED : PredExecDomains)
+      mergeInPredecessor(A, EntryBBED, PredED);
+
+  } else {
+    // We could not find all predecessors, so this is either a kernel or a
+    // function with external linkage (or with some other weird uses).
+    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+    if (OMPInfoCache.Kernels.count(getAnchorScope())) {
+      EntryBBED.IsExecutedByInitialThreadOnly = false;
+      EntryBBED.IsReachedFromAlignedBarrierOnly = true;
+      EntryBBED.EncounteredNonLocalSideEffect = false;
+    } else {
+      EntryBBED.IsExecutedByInitialThreadOnly = false;
+      EntryBBED.IsReachedFromAlignedBarrierOnly = false;
+      EntryBBED.EncounteredNonLocalSideEffect = true;
     }
+  }
+
+  auto &FnED = BEDMap[nullptr];
+  FnED.IsReachingAlignedBarrierOnly &=
+      EntryBBED.IsReachedFromAlignedBarrierOnly;
+}
+
+ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
+
+  bool Changed = false;
 
-    return IsInitialThread;
+  // Helper to deal with an aligned barrier encountered during the forward
+  // traversal. \p CB is the aligned barrier, \p ED is the execution domain when
+  // it was encountered.
+  auto HandleAlignedBarrier = [&](CallBase *CB, ExecutionDomainTy &ED) {
+    if (CB)
+      Changed |= AlignedBarriers.insert(CB);
+    // First, update the barrier ED kept in the separate CEDMap.
+    auto &CallED = CEDMap[CB];
+    mergeInPredecessor(A, CallED, ED);
+    // Next adjust the ED we use for the traversal.
+    ED.EncounteredNonLocalSideEffect = false;
+    ED.IsReachedFromAlignedBarrierOnly = true;
+    // Aligned barrier collection has to come last.
+    ED.clearAssumeInstAndAlignedBarriers();
+    if (CB)
+      ED.addAlignedBarrier(A, *CB);
   };
 
-  for (auto *BB : RPOT) {
-    if (!MergePredecessorStates(BB))
-      SingleThreadedBBs.remove(BB);
+  auto &LivenessAA =
+      A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);
+
+  // Set \p R to \p V and report true if that changed \p R.
+  auto SetAndRecord = [&](bool &R, bool V) {
+    bool Eq = (R == V);
+    R = V;
+    return !Eq;
+  };
+
+  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+  Function *F = getAnchorScope();
+  BasicBlock &EntryBB = F->getEntryBlock();
+
+  SmallVector<Instruction *> SyncInstWorklist;
+  for (auto &RIt : *RPOT) {
+    BasicBlock &BB = *RIt;
+
+    ExecutionDomainTy ED;
+    // Propagate "incoming edges" into information about this block.
+    if (&BB == &EntryBB) {
+      handleEntryBB(A, ED);
+    } else {
+      // For live non-entry blocks we only propagate information via live edges.
+      if (LivenessAA.isAssumedDead(&BB))
+        continue;
+
+      for (auto *PredBB : predecessors(&BB)) {
+        if (LivenessAA.isEdgeDead(PredBB, &BB))
+          continue;
+        bool InitialEdgeOnly = isInitialThreadOnlyEdge(
+            A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
+        mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);
+      }
+    }
+
+    // Now we traverse the block, accumulate effects in ED and attach
+    // information to calls.
+    for (Instruction &I : BB) {
+      bool UsedAssumedInformation;
+      if (A.isAssumedDead(I, *this, &LivenessAA, UsedAssumedInformation,
+                          /* CheckBBLivenessOnly */ false, DepClassTy::OPTIONAL,
+                          /* CheckForDeadStore */ true))
+        continue;
+
+      // Assumes and "assume-like" intrinsics (dbg, lifetime, ...) are handled
+      // first; the former are collected, the latter are ignored.
+      if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+        if (auto *AI = dyn_cast_or_null<AssumeInst>(II)) {
+          ED.addAssumeInst(A, *AI);
+          continue;
+        }
+        // TODO: Should we also collect and delete lifetime markers?
+        if (II->isAssumeLikeIntrinsic())
+          continue;
+      }
+
+      auto *CB = dyn_cast<CallBase>(&I);
+      bool IsNoSync = AA::isNoSyncInst(A, I, *this);
+      bool IsAlignedBarrier =
+          !IsNoSync && CB && AANoSync::isAlignedBarrier(*CB);
+
+      // Next we check for calls. Aligned barriers are handled explicitly;
+      // everything else is kept for the backward traversal and will also
+      // affect our state.
+      if (CB) {
+        if (IsAlignedBarrier) {
+          HandleAlignedBarrier(CB, ED);
+          continue;
+        }
+
+        // Check the pointer(s) of a memory intrinsic explicitly.
+        if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&I)) {
+          if (!ED.EncounteredNonLocalSideEffect &&
+              AA::isPotentiallyAffectedByBarrier(A, I, *this))
+            ED.EncounteredNonLocalSideEffect = true;
+          if (!IsNoSync) {
+            ED.IsReachedFromAlignedBarrierOnly = false;
+            SyncInstWorklist.push_back(&I);
+          }
+          continue;
+        }
+
+        // Record how we entered the call, then accumulate the effect of the
+        // call in ED for potential use by the callee.
+        auto &CallED = CEDMap[CB];
+        mergeInPredecessor(A, CallED, ED);
+
+        // If we have a sync-definition we can check if it starts/ends in an
+        // aligned barrier. If we are unsure we assume any sync breaks
+        // alignment.
+        Function *Callee = CB->getCalledFunction();
+        if (!IsNoSync && Callee && !Callee->isDeclaration()) {
+          const auto &EDAA = A.getAAFor<AAExecutionDomain>(
+              *this, IRPosition::function(*Callee), DepClassTy::OPTIONAL);
+          if (EDAA.getState().isValidState()) {
+            const auto &CalleeED = EDAA.getFunctionExecutionDomain();
+            ED.IsReachedFromAlignedBarrierOnly =
+                CalleeED.IsReachedFromAlignedBarrierOnly;
+            if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
+              ED.EncounteredNonLocalSideEffect |=
+                  CalleeED.EncounteredNonLocalSideEffect;
+            else
+              ED.EncounteredNonLocalSideEffect =
+                  CalleeED.EncounteredNonLocalSideEffect;
+            if (!CalleeED.IsReachingAlignedBarrierOnly)
+              SyncInstWorklist.push_back(&I);
+            if (CalleeED.IsReachedFromAlignedBarrierOnly)
+              mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
+            continue;
+          }
+        }
+        ED.IsReachedFromAlignedBarrierOnly =
+            IsNoSync && ED.IsReachedFromAlignedBarrierOnly;
+        ED.EncounteredNonLocalSideEffect = true;
+        if (!IsNoSync)
+          SyncInstWorklist.push_back(&I);
+      }
+
+      if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
+        continue;
+
+      // If we have a callee we try to use fine-grained information to
+      // determine local side-effects.
+      if (CB) {
+        const auto &MemAA = A.getAAFor<AAMemoryLocation>(
+            *this, IRPosition::callsite_function(*CB), DepClassTy::OPTIONAL);
+
+        auto AccessPred = [&](const Instruction *I, const Value *Ptr,
+                              AAMemoryLocation::AccessKind,
+                              AAMemoryLocation::MemoryLocationsKind) {
+          return !AA::isPotentiallyAffectedByBarrier(A, {Ptr}, *this, I);
+        };
+        if (MemAA.getState().isValidState() &&
+            MemAA.checkForAllAccessesToMemoryKind(
+                AccessPred, AAMemoryLocation::ALL_LOCATIONS))
+          continue;
+      }
+
+      if (!I.mayHaveSideEffects() && OMPInfoCache.isOnlyUsedByAssume(I))
+        continue;
+
+      if (auto *LI = dyn_cast<LoadInst>(&I))
+        if (LI->hasMetadata(LLVMContext::MD_invariant_load))
+          continue;
+
+      if (!ED.EncounteredNonLocalSideEffect &&
+          AA::isPotentiallyAffectedByBarrier(A, I, *this))
+        ED.EncounteredNonLocalSideEffect = true;
+    }
+
+    if (!isa<UnreachableInst>(BB.getTerminator()) &&
+        !BB.getTerminator()->getNumSuccessors()) {
+
+      auto &FnED = BEDMap[nullptr];
+      mergeInPredecessor(A, FnED, ED);
+
+      if (OMPInfoCache.Kernels.count(F))
+        HandleAlignedBarrier(nullptr, ED);
+    }
+
+    ExecutionDomainTy &StoredED = BEDMap[&BB];
+    ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly;
+
+    // Check if we computed anything different as part of the forward
+    // traversal. We do not take assumptions and aligned barriers into account
+    // as they do not influence the state we iterate. Backward traversal values
+    // are handled later on.
+    if (ED.IsExecutedByInitialThreadOnly !=
+            StoredED.IsExecutedByInitialThreadOnly ||
+        ED.IsReachedFromAlignedBarrierOnly !=
+            StoredED.IsReachedFromAlignedBarrierOnly ||
+        ED.EncounteredNonLocalSideEffect !=
+            StoredED.EncounteredNonLocalSideEffect)
+      Changed = true;
+
+    // Update the state with the new value.
+    StoredED = std::move(ED);
+  }
+
+  // Propagate (non-aligned) sync instruction effects backwards until the
+  // entry or an aligned barrier is hit.
+  SmallSetVector<BasicBlock *, 16> Visited;
+  while (!SyncInstWorklist.empty()) {
+    Instruction *SyncInst = SyncInstWorklist.pop_back_val();
+    Instruction *CurInst = SyncInst;
+    bool HitAlignedBarrier = false;
+    while ((CurInst = CurInst->getPrevNode())) {
+      auto *CB = dyn_cast<CallBase>(CurInst);
+      if (!CB)
+        continue;
+      auto &CallED = CEDMap[CB];
+      if (SetAndRecord(CallED.IsReachingAlignedBarrierOnly, false))
+        Changed = true;
+      HitAlignedBarrier = AlignedBarriers.count(CB);
+      if (HitAlignedBarrier)
+        break;
+    }
+    if (HitAlignedBarrier)
+      continue;
+    BasicBlock *SyncBB = SyncInst->getParent();
+    for (auto *PredBB : predecessors(SyncBB)) {
+      if (LivenessAA.isEdgeDead(PredBB, SyncBB))
+        continue;
+      if (!Visited.insert(PredBB))
+        continue;
+      SyncInstWorklist.push_back(PredBB->getTerminator());
+      auto &PredED = BEDMap[PredBB];
+      if (SetAndRecord(PredED.IsReachingAlignedBarrierOnly, false))
+        Changed = true;
+    }
+    if (SyncBB != &EntryBB)
+      continue;
+    auto &FnED = BEDMap[nullptr];
+    if (SetAndRecord(FnED.IsReachingAlignedBarrierOnly, false))
+      Changed = true;
   }
 
-  return (NumSingleThreadedBBs == SingleThreadedBBs.size())
-             ? ChangeStatus::UNCHANGED
-             : ChangeStatus::CHANGED;
+  return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
 }
 
 /// Try to replace memory allocation calls called by a single thread with a
@@ -2963,9 +3116,11 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
     Attributor::SimplifictionCallbackTy SCB =
         [](const IRPosition &, const AbstractAttribute *,
            bool &) -> std::optional<Value *> { return nullptr; };
+
+    Function *F = getAnchorScope();
     for (User *U : RFI.Declaration->users())
       if (CallBase *CB = dyn_cast<CallBase>(U)) {
-        if (CB->getCaller() != getAnchorScope())
+        if (CB->getFunction() != F)
           continue;
         MallocCalls.insert(CB);
         A.registerSimplificationCallback(IRPosition::callsite_returned(*CB),
@@ -3079,6 +3234,8 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
       if (CallBase *CB = dyn_cast<CallBase>(U)) {
         if (CB->getCaller() != F)
           continue;
+        if (!MallocCalls.count(CB))
+          continue;
         if (!isa<ConstantInt>(CB->getArgOperand(0))) {
           MallocCalls.remove(CB);
           continue;
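
One consequence of the new kernel-end handling above: the implicit
aligned barrier at a kernel exit lets a trailing explicit barrier be
removed, assuming the function is registered as a kernel (e.g. via
!nvvm.annotations as in the tests):

  declare void @aligned_barrier() "llvm.assume"="ompx_aligned_barrier"

  define void @kern() {
    call void @aligned_barrier() ; removable: only the kernel exit follows
    ret void
  }

  !nvvm.annotations = !{!0}
  !0 = !{ptr @kern, !"kernel", i32 1}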

diff --git a/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll b/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
index 57cbcb98794f5..4a594ecaaf9c3 100644
--- a/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
+++ b/llvm/test/Transforms/Attributor/reduced/openmp_opt_dont_follow_gep_without_value.ll
@@ -25,7 +25,6 @@ define weak_odr ptr @h(ptr %0) {
 ; CHECK: Function Attrs: norecurse nounwind memory(none)
 ; CHECK-LABEL: define {{[^@]+}}@f
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = alloca i64, align 8
 ; CHECK-NEXT:    ret void
 ;
 ;

diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll
index a6d67d8b953e3..fc6bcf9391e7c 100644
--- a/llvm/test/Transforms/OpenMP/barrier_removal.ll
+++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll
@@ -1,5 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s
+; RUN: opt < %s -S -passes=openmp-opt | FileCheck %s --check-prefixes=CHECK,MODULE
+; RUN: opt < %s -S -passes=openmp-opt-cgscc | FileCheck %s --check-prefixes=CHECK,CGSCC
+target triple = "amdgcn-amd-amdhsa"
 
 declare void @useI32(i32)
 declare void @unknown()
@@ -102,7 +104,6 @@ define void @pos_constant_loads() {
 ; CHECK-NEXT:    [[B:%.*]] = load i32, ptr addrspacecast (ptr addrspace(4) @GC2 to ptr), align 4
 ; CHECK-NEXT:    [[ARGC:%.*]] = addrspacecast ptr addrspace(4) [[ARG]] to ptr
 ; CHECK-NEXT:    [[C:%.*]] = load i32, ptr [[ARGC]], align 4
-; CHECK-NEXT:    call void @aligned_barrier()
 ; CHECK-NEXT:    [[D:%.*]] = add i32 42, [[B]]
 ; CHECK-NEXT:    [[E:%.*]] = add i32 [[D]], [[C]]
 ; CHECK-NEXT:    call void @useI32(i32 [[E]])
@@ -164,7 +165,6 @@ define void @pos_priv_mem() {
 ; CHECK-NEXT:    [[A:%.*]] = load i32, ptr @PG1, align 4
 ; CHECK-NEXT:    store i32 [[A]], ptr [[LOC]], align 4
 ; CHECK-NEXT:    [[B:%.*]] = load i32, ptr addrspacecast (ptr addrspace(5) @PG2 to ptr), align 4
-; CHECK-NEXT:    call void @aligned_barrier()
 ; CHECK-NEXT:    [[ARGC:%.*]] = addrspacecast ptr addrspace(5) [[ARG]] to ptr
 ; CHECK-NEXT:    store i32 [[B]], ptr [[ARGC]], align 4
 ; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[LOC]], align 4
@@ -228,31 +228,651 @@ define void @pos_multiple() {
   ret void
 }
 
-!llvm.module.flags = !{!12,!13}
-!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11}
+define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_1
+; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT:    br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
+; CHECK:       t0:
+; CHECK-NEXT:    br label [[T0B:%.*]]
+; CHECK:       t0b:
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f0:
+; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
+; CHECK:       t1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       f1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.nvvm.barrier0()
+  call void @aligned_barrier()
+  br i1 %c0, label %t0, label %f0
+t0:
+  call void @aligned_barrier()
+  br label %t0b
+t0b:
+  call void @aligned_barrier()
+  br label %m
+f0:
+  call void @aligned_barrier()
+  call void @llvm.nvvm.barrier0()
+  br i1 %c1, label %t1, label %f1
+t1:
+  call void @aligned_barrier()
+  br label %m
+f1:
+  call void @aligned_barrier()
+  br label %m
+m:
+  call void @aligned_barrier()
+  ret void
+}
+
+define void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_2
+; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    store i32 4, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
+; CHECK:       t0:
+; CHECK-NEXT:    br label [[T0B:%.*]]
+; CHECK:       t0b:
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f0:
+; CHECK-NEXT:    store i32 4, ptr [[P]], align 4
+; CHECK-NEXT:    call void @llvm.nvvm.barrier0()
+; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
+; CHECK:       t1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       f1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    store i32 4, ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.nvvm.barrier0()
+  store i32 4, i32* %p
+  call void @aligned_barrier()
+  br i1 %c0, label %t0, label %f0
+t0:
+  call void @aligned_barrier()
+  br label %t0b
+t0b:
+  call void @aligned_barrier()
+  br label %m
+f0:
+  call void @aligned_barrier()
+  store i32 4, i32* %p
+  call void @llvm.nvvm.barrier0()
+  br i1 %c1, label %t1, label %f1
+t1:
+  call void @aligned_barrier()
+  br label %m
+f1:
+  call void @aligned_barrier()
+  br label %m
+m:
+  store i32 4, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+
+define void @multiple_blocks_non_kernel_1(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_1
+; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT:    call void @llvm.nvvm.barrier0()
+; CHECK-NEXT:    br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
+; CHECK:       t0:
+; CHECK-NEXT:    br label [[T0B:%.*]]
+; CHECK:       t0b:
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f0:
+; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
+; CHECK:       t1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       f1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.nvvm.barrier0()
+  call void @aligned_barrier()
+  br i1 %c0, label %t0, label %f0
+t0:
+  call void @aligned_barrier()
+  br label %t0b
+t0b:
+  call void @aligned_barrier()
+  br label %m
+f0:
+  call void @aligned_barrier()
+  call void @llvm.nvvm.barrier0()
+  br i1 %c1, label %t1, label %f1
+t1:
+  call void @aligned_barrier()
+  br label %m
+f1:
+  call void @aligned_barrier()
+  br label %m
+m:
+  call void @aligned_barrier()
+  ret void
+}
+
+define void @multiple_blocks_non_kernel_2(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_2
+; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT:    br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
+; CHECK:       t0:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br label [[T0B:%.*]]
+; CHECK:       t0b:
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f0:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
+; CHECK:       t1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       f1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    ret void
+;
+  br i1 %c0, label %t0, label %f0
+t0:
+  call void @aligned_barrier()
+  br label %t0b
+t0b:
+  call void @aligned_barrier()
+  br label %m
+f0:
+  call void @aligned_barrier()
+  call void @llvm.nvvm.barrier0()
+  br i1 %c1, label %t1, label %f1
+t1:
+  call void @aligned_barrier()
+  br label %m
+f1:
+  call void @aligned_barrier()
+  br label %m
+m:
+  call void @aligned_barrier()
+  ret void
+}
+
+define void @multiple_blocks_non_kernel_3(i1 %c0, i1 %c1) {
+; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_3
+; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT:    br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
+; CHECK:       t0:
+; CHECK-NEXT:    br label [[T0B:%.*]]
+; CHECK:       t0b:
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f0:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
+; CHECK:       t1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       f1:
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    ret void
+;
+  br i1 %c0, label %t0, label %f0
+t0:
+  br label %t0b
+t0b:
+  br label %m
+f0:
+  call void @aligned_barrier()
+  call void @llvm.nvvm.barrier0()
+  br i1 %c1, label %t1, label %f1
+t1:
+  call void @aligned_barrier()
+  br label %m
+f1:
+  call void @aligned_barrier()
+  br label %m
+m:
+  call void @aligned_barrier()
+  ret void
+}
+
+define void @multiple_blocks_non_kernel_effects_1(i1 %c0, i1 %c1, i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_non_kernel_effects_1
+; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    store i32 0, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]]
+; CHECK:       t0:
+; CHECK-NEXT:    store i32 1, ptr [[P]], align 4
+; CHECK-NEXT:    br label [[T0B:%.*]]
+; CHECK:       t0b:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br label [[M:%.*]]
+; CHECK:       f0:
+; CHECK-NEXT:    store i32 2, ptr [[P]], align 4
+; CHECK-NEXT:    br i1 [[C1]], label [[T1:%.*]], label [[F1:%.*]]
+; CHECK:       t1:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       f1:
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    br label [[M]]
+; CHECK:       m:
+; CHECK-NEXT:    store i32 3, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  br i1 %c0, label %t0, label %f0
+t0:
+  call void @aligned_barrier()
+  store i32 1, i32* %p
+  br label %t0b
+t0b:
+  call void @aligned_barrier()
+  br label %m
+f0:
+  call void @aligned_barrier()
+  call void @llvm.nvvm.barrier0()
+  store i32 2, i32* %p
+  br i1 %c1, label %t1, label %f1
+t1:
+  call void @aligned_barrier()
+  br label %m
+f1:
+  call void @aligned_barrier()
+  br label %m
+m:
+  call void @aligned_barrier()
+  store i32 3, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+
+define internal void @write_then_barrier0(i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@write_then_barrier0
+; MODULE-SAME: (ptr [[P:%.*]]) {
+; MODULE-NEXT:    store i32 0, ptr [[P]], align 4
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@write_then_barrier0
+; CGSCC-SAME: (ptr [[P:%.*]]) {
+; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    ret void
+;
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+define internal void @barrier_then_write0(i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@barrier_then_write0
+; MODULE-SAME: (ptr [[P:%.*]]) {
+; MODULE-NEXT:    store i32 0, ptr [[P]], align 4
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write0
+; CGSCC-SAME: (ptr [[P:%.*]]) {
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  ret void
+}
+define internal void @barrier_then_write_then_barrier0(i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0
+; MODULE-SAME: (ptr [[P:%.*]]) {
+; MODULE-NEXT:    store i32 0, ptr [[P]], align 4
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier0
+; CGSCC-SAME: (ptr [[P:%.*]]) {
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0
+; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; MODULE-NEXT:    call void @barrier_then_write_then_barrier0(ptr [[P]])
+; MODULE-NEXT:    br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]]
+; MODULE:       t03:
+; MODULE-NEXT:    call void @barrier_then_write0(ptr [[P]])
+; MODULE-NEXT:    br label [[T0B3:%.*]]
+; MODULE:       t0b3:
+; MODULE-NEXT:    br label [[M3:%.*]]
+; MODULE:       f03:
+; MODULE-NEXT:    call void @barrier_then_write0(ptr [[P]])
+; MODULE-NEXT:    br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
+; MODULE:       t13:
+; MODULE-NEXT:    br label [[M3]]
+; MODULE:       f13:
+; MODULE-NEXT:    br label [[M3]]
+; MODULE:       m3:
+; MODULE-NEXT:    call void @write_then_barrier0(ptr [[P]])
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0
+; CGSCC-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; CGSCC-NEXT:    call void @barrier_then_write_then_barrier0(ptr [[P]])
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]]
+; CGSCC:       t03:
+; CGSCC-NEXT:    call void @barrier_then_write0(ptr [[P]])
+; CGSCC-NEXT:    br label [[T0B3:%.*]]
+; CGSCC:       t0b3:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3:%.*]]
+; CGSCC:       f03:
+; CGSCC-NEXT:    call void @barrier_then_write0(ptr [[P]])
+; CGSCC-NEXT:    br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
+; CGSCC:       t13:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3]]
+; CGSCC:       f13:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3]]
+; CGSCC:       m3:
+; CGSCC-NEXT:    call void @write_then_barrier0(ptr [[P]])
+; CGSCC-NEXT:    ret void
+;
+  call void @barrier_then_write_then_barrier0(i32* %p)
+  call void @aligned_barrier()
+  br i1 %c0, label %t03, label %f03
+t03:
+  call void @barrier_then_write0(i32* %p)
+  br label %t0b3
+t0b3:
+  call void @aligned_barrier()
+  br label %m3
+f03:
+  call void @aligned_barrier()
+  call void @barrier_then_write0(i32* %p)
+  br i1 %c1, label %t13, label %f13
+t13:
+  call void @aligned_barrier()
+  br label %m3
+f13:
+  call void @aligned_barrier()
+  br label %m3
+m3:
+  call void @aligned_barrier()
+  call void @write_then_barrier0(i32* %p)
+  ret void
+}
+define internal void @write_then_barrier1(i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@write_then_barrier1
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    store i32 0, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    ret void
+;
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+define internal void @barrier_then_write1(i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@barrier_then_write1
+; MODULE-SAME: (ptr [[P:%.*]]) {
+; MODULE-NEXT:    store i32 0, ptr [[P]], align 4
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@barrier_then_write1
+; CGSCC-SAME: (ptr [[P:%.*]]) {
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  ret void
+}
+define internal void @barrier_then_write_then_barrier1(i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier1
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    store i32 0, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+define void @multiple_blocks_functions_non_kernel_effects_1(i1 %c0, i1 %c1, i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_1
+; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; MODULE-NEXT:    call void @barrier_then_write_then_barrier1(ptr [[P]])
+; MODULE-NEXT:    br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]]
+; MODULE:       t03:
+; MODULE-NEXT:    call void @barrier_then_write1(ptr [[P]])
+; MODULE-NEXT:    br label [[T0B3:%.*]]
+; MODULE:       t0b3:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    br label [[M3:%.*]]
+; MODULE:       f03:
+; MODULE-NEXT:    call void @barrier_then_write1(ptr [[P]])
+; MODULE-NEXT:    br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
+; MODULE:       t13:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    br label [[M3]]
+; MODULE:       f13:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    br label [[M3]]
+; MODULE:       m3:
+; MODULE-NEXT:    call void @write_then_barrier1(ptr [[P]])
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_1
+; CGSCC-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; CGSCC-NEXT:    call void @barrier_then_write_then_barrier1(ptr [[P]])
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]]
+; CGSCC:       t03:
+; CGSCC-NEXT:    call void @barrier_then_write1(ptr [[P]])
+; CGSCC-NEXT:    br label [[T0B3:%.*]]
+; CGSCC:       t0b3:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3:%.*]]
+; CGSCC:       f03:
+; CGSCC-NEXT:    call void @barrier_then_write1(ptr [[P]])
+; CGSCC-NEXT:    br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
+; CGSCC:       t13:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3]]
+; CGSCC:       f13:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3]]
+; CGSCC:       m3:
+; CGSCC-NEXT:    call void @write_then_barrier1(ptr [[P]])
+; CGSCC-NEXT:    ret void
+;
+  call void @barrier_then_write_then_barrier1(i32* %p)
+  call void @aligned_barrier()
+  br i1 %c0, label %t03, label %f03
+t03:
+  call void @barrier_then_write1(i32* %p)
+  br label %t0b3
+t0b3:
+  call void @aligned_barrier()
+  br label %m3
+f03:
+  call void @aligned_barrier()
+  call void @barrier_then_write1(i32* %p)
+  br i1 %c1, label %t13, label %f13
+t13:
+  call void @aligned_barrier()
+  br label %m3
+f13:
+  call void @aligned_barrier()
+  br label %m3
+m3:
+  call void @aligned_barrier()
+  call void @write_then_barrier1(i32* %p)
+  ret void
+}
+
+define internal void @write_then_barrier2(i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@write_then_barrier2
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    store i32 0, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    ret void
+;
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+define internal void @barrier_then_write2(i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@barrier_then_write2
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    store i32 0, ptr [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  ret void
+}
+define internal void @barrier_then_write_then_barrier2(i32* %p) {
+; CHECK-LABEL: define {{[^@]+}}@barrier_then_write_then_barrier2
+; CHECK-SAME: (ptr [[P:%.*]]) {
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    store i32 0, ptr [[P]], align 4
+; CHECK-NEXT:    call void @aligned_barrier()
+; CHECK-NEXT:    ret void
+;
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  call void @aligned_barrier()
+  ret void
+}
+define void @multiple_blocks_functions_non_kernel_effects_2(i1 %c0, i1 %c1, i32* %p) {
+; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_2
+; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; MODULE-NEXT:    call void @barrier_then_write_then_barrier2(ptr [[P]])
+; MODULE-NEXT:    store i32 0, ptr [[P]], align 4
+; MODULE-NEXT:    br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]]
+; MODULE:       t03:
+; MODULE-NEXT:    call void @barrier_then_write2(ptr [[P]])
+; MODULE-NEXT:    br label [[T0B3:%.*]]
+; MODULE:       t0b3:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    br label [[M3:%.*]]
+; MODULE:       f03:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    call void @barrier_then_write2(ptr [[P]])
+; MODULE-NEXT:    br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
+; MODULE:       t13:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    br label [[M3]]
+; MODULE:       f13:
+; MODULE-NEXT:    call void @aligned_barrier()
+; MODULE-NEXT:    br label [[M3]]
+; MODULE:       m3:
+; MODULE-NEXT:    call void @write_then_barrier2(ptr [[P]])
+; MODULE-NEXT:    store i32 0, ptr [[P]], align 4
+; MODULE-NEXT:    ret void
+;
+; CGSCC-LABEL: define {{[^@]+}}@multiple_blocks_functions_non_kernel_effects_2
+; CGSCC-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) {
+; CGSCC-NEXT:    call void @barrier_then_write_then_barrier2(ptr [[P]])
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT:    br i1 [[C0]], label [[T03:%.*]], label [[F03:%.*]]
+; CGSCC:       t03:
+; CGSCC-NEXT:    call void @barrier_then_write2(ptr [[P]])
+; CGSCC-NEXT:    br label [[T0B3:%.*]]
+; CGSCC:       t0b3:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3:%.*]]
+; CGSCC:       f03:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    call void @barrier_then_write2(ptr [[P]])
+; CGSCC-NEXT:    br i1 [[C1]], label [[T13:%.*]], label [[F13:%.*]]
+; CGSCC:       t13:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3]]
+; CGSCC:       f13:
+; CGSCC-NEXT:    call void @aligned_barrier()
+; CGSCC-NEXT:    br label [[M3]]
+; CGSCC:       m3:
+; CGSCC-NEXT:    call void @write_then_barrier2(ptr [[P]])
+; CGSCC-NEXT:    store i32 0, ptr [[P]], align 4
+; CGSCC-NEXT:    ret void
+;
+  call void @barrier_then_write_then_barrier2(i32* %p)
+  call void @aligned_barrier()
+  store i32 0, i32* %p
+  br i1 %c0, label %t03, label %f03
+t03:
+  call void @barrier_then_write2(i32* %p)
+  br label %t0b3
+t0b3:
+  call void @aligned_barrier()
+  br label %m3
+f03:
+  call void @aligned_barrier()
+  call void @barrier_then_write2(i32* %p)
+  br i1 %c1, label %t13, label %f13
+t13:
+  call void @aligned_barrier()
+  br label %m3
+f13:
+  call void @aligned_barrier()
+  br label %m3
+m3:
+  call void @aligned_barrier()
+  call void @write_then_barrier2(i32* %p)
+  store i32 0, i32* %p
+  ret void
+}
+
+!llvm.module.flags = !{!16,!15}
+!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14}
 
-!0 = !{ptr @pos_empty_1, !"kernel", i32 1}
-!1 = !{ptr @pos_empty_2, !"kernel", i32 1}
-!2 = !{ptr @pos_empty_3, !"kernel", i32 1}
-!3 = !{ptr @pos_empty_4, !"kernel", i32 1}
-!4 = !{ptr @pos_empty_5, !"kernel", i32 1}
-!5 = !{ptr @pos_empty_6, !"kernel", i32 1}
-!6 = !{ptr @neg_empty_7, !"kernel", i32 1}
-!7 = !{ptr @pos_constant_loads, !"kernel", i32 1}
-!8 = !{ptr @neg_loads, !"kernel", i32 1}
-!9 = !{ptr @pos_priv_mem, !"kernel", i32 1}
-!10 = !{ptr @neg_mem, !"kernel", i32 1}
-!11 = !{ptr @pos_multiple, !"kernel", i32 1}
-!12 = !{i32 7, !"openmp", i32 50}
-!13 = !{i32 7, !"openmp-device", i32 50}
+!0 = !{void ()* @pos_empty_1, !"kernel", i32 1}
+!1 = !{void ()* @pos_empty_2, !"kernel", i32 1}
+!2 = !{void ()* @pos_empty_3, !"kernel", i32 1}
+!3 = !{void ()* @pos_empty_4, !"kernel", i32 1}
+!4 = !{void ()* @pos_empty_5, !"kernel", i32 1}
+!5 = !{void ()* @pos_empty_6, !"kernel", i32 1}
+!6 = !{void ()* @neg_empty_7, !"kernel", i32 1}
+!7 = !{void ()* @pos_constant_loads, !"kernel", i32 1}
+!8 = !{void ()* @neg_loads, !"kernel", i32 1}
+!9 = !{void ()* @pos_priv_mem, !"kernel", i32 1}
+!10 = !{void ()* @neg_mem, !"kernel", i32 1}
+!11 = !{void ()* @pos_multiple, !"kernel", i32 1}
+!12 = !{void (i1,i1)* @multiple_blocks_kernel_1, !"kernel", i32 1}
+!13 = !{void (i1,i1,i32*)* @multiple_blocks_kernel_2, !"kernel", i32 1}
+!14 = !{void (i1,i1,i32*)* @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
+!15 = !{i32 7, !"openmp", i32 50}
+!16 = !{i32 7, !"openmp-device", i32 50}
 ;.
 ; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_aligned_barrier" }
 ; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent nocallback nounwind }
 ; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nocallback nofree nounwind willreturn }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
 ;.
-; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1}
 ; CHECK: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1}
 ; CHECK: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1}
@@ -265,4 +885,7 @@ define void @pos_multiple() {
 ; CHECK: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1}
 ; CHECK: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1}
 ; CHECK: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1}
+; CHECK: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1}
+; CHECK: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1}
+; CHECK: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1}
 ;.

diff --git a/llvm/test/Transforms/OpenMP/deduplication_target.ll b/llvm/test/Transforms/OpenMP/deduplication_target.ll
index bfb9e3bc457a0..f1e9d656e80cc 100644
--- a/llvm/test/Transforms/OpenMP/deduplication_target.ll
+++ b/llvm/test/Transforms/OpenMP/deduplication_target.ll
@@ -18,7 +18,6 @@ define weak void @__omp_offloading_50_a3e09bf8_foo_l2() #0 {
 ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_50_a3e09bf8_foo_l2
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1:[0-9]+]], i8 2, i1 false)
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2:[0-9]+]])
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1

diff --git a/llvm/test/Transforms/OpenMP/heap-to-shared-missing-declarations.ll b/llvm/test/Transforms/OpenMP/heap-to-shared-missing-declarations.ll
index 13a45c2c11dad..154e549b19e9b 100644
--- a/llvm/test/Transforms/OpenMP/heap-to-shared-missing-declarations.ll
+++ b/llvm/test/Transforms/OpenMP/heap-to-shared-missing-declarations.ll
@@ -7,7 +7,7 @@ define internal void @outlined0() {
 ; CHECK-LABEL: define {{[^@]+}}@outlined0
 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    call void @func() #[[ATTR0]]
+; CHECK-NEXT:    call void @func() #[[ATTR1:[0-9]+]]
 ; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() #[[ATTR0]]
 ; CHECK-NEXT:    ret void
 ;
@@ -18,9 +18,9 @@ bb:
 }
 
 define internal void @func() {
-; CHECK: Function Attrs: nounwind
+; CHECK: Function Attrs: nosync nounwind
 ; CHECK-LABEL: define {{[^@]+}}@func
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[I:%.*]] = load ptr, ptr null, align 4294967296
 ; CHECK-NEXT:    store i64 0, ptr [[I]], align 8
@@ -33,16 +33,16 @@ bb:
 }
 
 define internal void @outlined1() {
-; CHECK: Function Attrs: nounwind
+; CHECK: Function Attrs: nosync nounwind
 ; CHECK-LABEL: define {{[^@]+}}@outlined1
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[I:%.*]] = icmp sle i32 1, 0
 ; CHECK-NEXT:    br i1 [[I]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       common.ret:
 ; CHECK-NEXT:    ret void
 ; CHECK:       bb1:
-; CHECK-NEXT:    call void @func() #[[ATTR0]]
+; CHECK-NEXT:    call void @func() #[[ATTR1]]
 ; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    call void @__kmpc_free_shared(ptr null, i64 0) #[[ATTR0]]
@@ -67,7 +67,7 @@ bb2:                                              ; preds = %bb
 define void @user() {
 ; CHECK-LABEL: define {{[^@]+}}@user() {
 ; CHECK-NEXT:    call void @outlined0() #[[ATTR0]]
-; CHECK-NEXT:    call void @outlined1() #[[ATTR0]]
+; CHECK-NEXT:    call void @outlined1() #[[ATTR1]]
 ; CHECK-NEXT:    ret void
 ;
   call void @outlined0()
@@ -84,7 +84,7 @@ declare void @__kmpc_free_shared(ptr, i64)
 !1 = !{i32 7, !"openmp-device", i32 50}
 ;.
 ; CHECK: attributes #[[ATTR0]] = { nounwind }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nosync nounwind }
+; CHECK: attributes #[[ATTR1]] = { nosync nounwind }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}

diff --git a/llvm/test/Transforms/OpenMP/internals_cgscc.ll b/llvm/test/Transforms/OpenMP/internals_cgscc.ll
index 968a4654bcde5..3124bcf539d29 100644
--- a/llvm/test/Transforms/OpenMP/internals_cgscc.ll
+++ b/llvm/test/Transforms/OpenMP/internals_cgscc.ll
@@ -27,7 +27,6 @@ define internal void @foo() {
 
 define internal void @bar() {
 ; CHECK-LABEL: @bar(
-; CHECK-NEXT:    call void @foo()
 ; CHECK-NEXT:    ret void
 ;
   call void @foo()

diff --git a/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll b/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
index 855bab8b5d944..eb1ae5f0bc2c1 100644
--- a/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
+++ b/llvm/test/Transforms/OpenMP/reduced_pointer_info_assertion.ll
@@ -12,13 +12,7 @@ define internal i32 @nblist() {
   ret i32 0
 }
 
-
 define fastcc void @rec(ptr %0, i64 %1) {
-; CHECK-LABEL: define {{[^@]+}}@rec(
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0:%.*]], i64 [[TMP1:%.*]]
-; CHECK-NEXT:    store i32 0, ptr [[TMP3]], align 4
-; CHECK-NEXT:    call fastcc void @rec(ptr [[TMP0]], i64 0)
-; CHECK-NEXT:    ret void
   %3 = getelementptr i32, ptr %0, i64 %1
   store i32 0, ptr %3, align 4
   call fastcc void @rec(ptr %0, i64 0)
@@ -44,9 +38,9 @@ define fastcc void @rec(ptr %0, i64 %1) {
 ;
 ;
 ; CGSCC-LABEL: define {{[^@]+}}@rec
-; CGSCC-SAME: (ptr [[TMP0:%.*]], i64 [[TMP1:%.*]]) {
+; CGSCC-SAME: (ptr nocapture writeonly [[TMP0:%.*]], i64 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CGSCC-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP0]], i64 [[TMP1]]
 ; CGSCC-NEXT:    store i32 0, ptr [[TMP3]], align 4
-; CGSCC-NEXT:    call fastcc void @rec(ptr [[TMP0]], i64 0)
+; CGSCC-NEXT:    call fastcc void @rec(ptr nocapture writeonly [[TMP0]], i64 0) #[[ATTR1:[0-9]+]]
 ; CGSCC-NEXT:    ret void
 ;

diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll
index 046f39ebed504..203de81aaf29d 100644
--- a/llvm/test/Transforms/OpenMP/remove_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll
@@ -19,8 +19,10 @@ target triple = "nvptx64"
 ; Make it a weak definition so we will apply custom state machine rewriting but can't use the body in the reasoning.
 ;.
 ; CHECK: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global ptr
+; CHECK: @[[KERNEL_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
 ;.
 ; CHECK-DISABLED: @[[S:[a-zA-Z0-9_$"\\.-]+]] = external local_unnamed_addr global ptr
+; CHECK-DISABLED: @[[KERNEL_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
 ;.
 define weak i32 @__kmpc_target_init(ptr, i8, i1) {
 ; CHECK-LABEL: define {{[^@]+}}@__kmpc_target_init
@@ -70,17 +72,17 @@ define internal void @foo() {
 ; CHECK-LABEL: define {{[^@]+}}@foo
 ; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 1
+; CHECK-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 4
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@foo
 ; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
-; CHECK-DISABLED-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 1
+; CHECK-DISABLED-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 4
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 entry:
-  %0 = call ptr @__kmpc_alloc_shared(i64 4), !dbg !12
+  %0 = call align 4 ptr @__kmpc_alloc_shared(i64 4), !dbg !12
   call void @use(ptr %0)
   call void @__kmpc_free_shared(ptr %0, i64 4)
   ret void
@@ -88,52 +90,42 @@ entry:
 
 define internal void @bar() {
 ; CHECK-LABEL: define {{[^@]+}}@bar
-; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call ptr @__kmpc_alloc_shared(i64 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    call void @share(ptr nofree [[TMP0]]) #[[ATTR1]], !dbg [[DBG8]]
-; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR4:[0-9]+]], !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    call void @share(ptr nofree [[TMP0]]) #[[ATTR0]], !dbg [[DBG8]]
+; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR4]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@bar
-; CHECK-DISABLED-SAME: () #[[ATTR1:[0-9]+]] {
+; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call ptr @__kmpc_alloc_shared(i64 4) #[[ATTR0]], !dbg [[DBG8:![0-9]+]]
-; CHECK-DISABLED-NEXT:    call void @share(ptr nofree [[TMP0]]) #[[ATTR1]], !dbg [[DBG8]]
-; CHECK-DISABLED-NEXT:    call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR0]]
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR4:[0-9]+]], !dbg [[DBG8:![0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @share(ptr nofree [[TMP0]]) #[[ATTR0]], !dbg [[DBG8]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR4]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 entry:
-  %0 = call ptr @__kmpc_alloc_shared(i64 4), !dbg !13
+  %0 = call align 4 ptr @__kmpc_alloc_shared(i64 4), !dbg !13
   call void @share(ptr %0), !dbg !13
   call void @__kmpc_free_shared(ptr %0, i64 4)
   ret void
 }
 
 define internal void @use(ptr %x) {
-; CHECK-LABEL: define {{[^@]+}}@use
-; CHECK-SAME: (ptr [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    ret void
-;
-; CHECK-DISABLED-LABEL: define {{[^@]+}}@use
-; CHECK-DISABLED-SAME: (ptr [[X:%.*]]) #[[ATTR2:[0-9]+]] {
-; CHECK-DISABLED-NEXT:  entry:
-; CHECK-DISABLED-NEXT:    ret void
-;
 entry:
   ret void
 }
 
 define internal void @share(ptr %x) {
 ; CHECK-LABEL: define {{[^@]+}}@share
-; CHECK-SAME: (ptr nofree [[X:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-SAME: (ptr nofree [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    store ptr [[X]], ptr @S, align 8
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@share
-; CHECK-DISABLED-SAME: (ptr nofree [[X:%.*]]) #[[ATTR3:[0-9]+]] {
+; CHECK-DISABLED-SAME: (ptr nofree [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-DISABLED-NEXT:  entry:
 ; CHECK-DISABLED-NEXT:    store ptr [[X]], ptr @S, align 8
 ; CHECK-DISABLED-NEXT:    ret void
@@ -146,19 +138,17 @@ entry:
 define void @unused() {
 ; CHECK-LABEL: define {{[^@]+}}@unused() {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 1
-; CHECK-NEXT:    call void @use(ptr undef)
+; CHECK-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 4
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@unused() {
 ; CHECK-DISABLED-NEXT:  entry:
-; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call ptr @__kmpc_alloc_shared(i64 4), !dbg [[DBG11:![0-9]+]]
-; CHECK-DISABLED-NEXT:    call void @use(ptr [[TMP0]])
-; CHECK-DISABLED-NEXT:    call void @__kmpc_free_shared(ptr [[TMP0]], i64 4)
+; CHECK-DISABLED-NEXT:    [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR4]], !dbg [[DBG11:![0-9]+]]
+; CHECK-DISABLED-NEXT:    call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR4]]
 ; CHECK-DISABLED-NEXT:    ret void
 ;
 entry:
-  %0 = call ptr @__kmpc_alloc_shared(i64 4), !dbg !14
+  %0 = call align 4 ptr @__kmpc_alloc_shared(i64 4), !dbg !14
   call void @use(ptr %0)
   call void @__kmpc_free_shared(ptr %0, i64 4)
   ret void
@@ -166,9 +156,9 @@ entry:
 
 define internal void @convert_and_move_alloca() {
 ; CHECK-LABEL: define {{[^@]+}}@convert_and_move_alloca
-; CHECK-SAME: () #[[ATTR1]] {
+; CHECK-SAME: () #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 1
+; CHECK-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 4
 ; CHECK-NEXT:    [[IV_PTR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    br label [[INITLOOP:%.*]]
 ; CHECK:       initloop:
@@ -186,9 +176,9 @@ define internal void @convert_and_move_alloca() {
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLED-LABEL: define {{[^@]+}}@convert_and_move_alloca
-; CHECK-DISABLED-SAME: () #[[ATTR1]] {
+; CHECK-DISABLED-SAME: () #[[ATTR0]] {
 ; CHECK-DISABLED-NEXT:  entry:
-; CHECK-DISABLED-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 1
+; CHECK-DISABLED-NEXT:    [[DOTH2S:%.*]] = alloca i8, i64 4, align 4
 ; CHECK-DISABLED-NEXT:    [[IV_PTR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLED-NEXT:    br label [[INITLOOP:%.*]]
 ; CHECK-DISABLED:       initloop:
@@ -217,7 +207,7 @@ initloop:
   br label %loopbody
 
 loopbody:
-  %0 = call ptr @__kmpc_alloc_shared(i64 4), !dbg !16
+  %0 = call align 4 ptr @__kmpc_alloc_shared(i64 4), !dbg !16
   call void @use(ptr %0)
   call void @__kmpc_free_shared(ptr %0, i64 4)
   %iv = load i32, ptr %iv_ptr
@@ -263,19 +253,17 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 !15 = !DILocation(line: 8, column: 2, scope: !9)
 !16 = !DILocation(line: 10, column: 2, scope: !9)
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind }
-; CHECK: attributes #[[ATTR1]] = { nosync nounwind }
-; CHECK: attributes #[[ATTR2]] = { nounwind memory(none) }
-; CHECK: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind memory(write) }
-; CHECK: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind allocsize(0) }
-; CHECK: attributes #[[ATTR5:[0-9]+]] = { "llvm.assume"="omp_no_openmp" }
+; CHECK: attributes #[[ATTR0]] = { nosync nounwind }
+; CHECK: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind memory(write) }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nosync nounwind allocsize(0) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" }
+; CHECK: attributes #[[ATTR4]] = { nounwind }
 ;.
-; CHECK-DISABLED: attributes #[[ATTR0]] = { nounwind }
-; CHECK-DISABLED: attributes #[[ATTR1]] = { nosync nounwind }
-; CHECK-DISABLED: attributes #[[ATTR2]] = { nounwind memory(none) }
-; CHECK-DISABLED: attributes #[[ATTR3]] = { nofree norecurse nosync nounwind memory(write) }
-; CHECK-DISABLED: attributes #[[ATTR4:[0-9]+]] = { nosync nounwind allocsize(0) }
-; CHECK-DISABLED: attributes #[[ATTR5:[0-9]+]] = { "llvm.assume"="omp_no_openmp" }
+; CHECK-DISABLED: attributes #[[ATTR0]] = { nosync nounwind }
+; CHECK-DISABLED: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind memory(write) }
+; CHECK-DISABLED: attributes #[[ATTR2:[0-9]+]] = { nosync nounwind allocsize(0) }
+; CHECK-DISABLED: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="omp_no_openmp" }
+; CHECK-DISABLED: attributes #[[ATTR4]] = { nounwind }
 ;.
 ; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
 ; CHECK: [[META1:![0-9]+]] = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c")

diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index 102c116180020..ff1f706b16f4f 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -132,6 +132,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK: @[[BAZ_SPMD_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 2
 ; CHECK: @[[OFFSET:[a-zA-Z0-9_$"\\.-]+]] = global i32 undef
 ; CHECK: @[[STACK:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [1024 x i8] undef
+; CHECK: @[[FOO_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[BAR_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[BAZ_SPMD_NESTED_PARALLELISM:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
 ; CHECK: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [16 x i8] undef, align 4
 ; CHECK: @[[Y_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] undef, align 4
 ;.
@@ -141,7 +144,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK-NEXT:    [[C:%.*]] = call i32 @__kmpc_target_init(ptr @[[GLOB1]], i8 1, i1 false)
 ; CHECK-NEXT:    [[X:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    call void @unknown_no_openmp()
-; CHECK-NEXT:    call void @use.internalized(ptr nofree [[X]]) #[[ATTR6]]
+; CHECK-NEXT:    call void @use.internalized(ptr nofree [[X]]) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR6]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 1)
 ; CHECK-NEXT:    ret void
@@ -154,14 +157,14 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C]], -1
 ; CHECK-NEXT:    br i1 [[CMP]], label [[MASTER1:%.*]], label [[EXIT:%.*]]
 ; CHECK:       master1:
-; CHECK-NEXT:    call void @use.internalized(ptr nofree addrspacecast (ptr addrspace(3) @x_shared to ptr)) #[[ATTR6]]
+; CHECK-NEXT:    call void @use.internalized(ptr nofree addrspacecast (ptr addrspace(3) @x_shared to ptr)) #[[ATTR3]]
 ; CHECK-NEXT:    br label [[NEXT:%.*]]
 ; CHECK:       next:
 ; CHECK-NEXT:    call void @unknown_no_openmp()
 ; CHECK-NEXT:    [[B0:%.*]] = icmp eq i32 [[C]], -1
 ; CHECK-NEXT:    br i1 [[B0]], label [[MASTER2:%.*]], label [[EXIT]]
 ; CHECK:       master2:
-; CHECK-NEXT:    call void @use.internalized(ptr nofree addrspacecast (ptr addrspace(3) @y_shared to ptr)) #[[ATTR6]]
+; CHECK-NEXT:    call void @use.internalized(ptr nofree addrspacecast (ptr addrspace(3) @y_shared to ptr)) #[[ATTR3]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 1)
@@ -176,7 +179,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK-NEXT:    br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]]
 ; CHECK:       master3:
 ; CHECK-NEXT:    [[Z:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 24) #[[ATTR6]], !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT:    call void @use.internalized(ptr nofree [[Z]]) #[[ATTR6]]
+; CHECK-NEXT:    call void @use.internalized(ptr nofree [[Z]]) #[[ATTR3]]
 ; CHECK-NEXT:    call void @__kmpc_free_shared(ptr [[Z]], i64 24) #[[ATTR6]]
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
@@ -184,7 +187,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: nofree norecurse nounwind memory(write)
+; CHECK: Function Attrs: nofree norecurse nosync nounwind memory(write)
 ; CHECK-LABEL: define {{[^@]+}}@use.internalized
 ; CHECK-SAME: (ptr nofree [[X:%.*]]) #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
@@ -199,7 +202,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ; CHECK-NEXT:    ret void
 ;
 ;
-; CHECK: Function Attrs: nosync nounwind allocsize(0) memory(read)
+; CHECK: Function Attrs: norecurse nosync nounwind allocsize(0) memory(read)
 ; CHECK-LABEL: define {{[^@]+}}@__kmpc_alloc_shared
 ; CHECK-SAME: (i64 [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] {
 ; CHECK-NEXT:    [[L:%.*]] = load i32, ptr @offset, align 4
@@ -213,9 +216,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp"
 ;
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "kernel" }
-; CHECK: attributes #[[ATTR1]] = { nofree norecurse nounwind memory(write) }
-; CHECK: attributes #[[ATTR2]] = { nosync nounwind allocsize(0) memory(read) }
-; CHECK: attributes #[[ATTR3:[0-9]+]] = { nosync nounwind }
+; CHECK: attributes #[[ATTR1]] = { nofree norecurse nosync nounwind memory(write) }
+; CHECK: attributes #[[ATTR2]] = { norecurse nosync nounwind allocsize(0) memory(read) }
+; CHECK: attributes #[[ATTR3]] = { nosync nounwind }
 ; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { "llvm.assume"="omp_no_openmp" }
 ; CHECK: attributes #[[ATTR6]] = { nounwind }

diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
index daa77cc3fdafb..e7100dbf06822 100644
--- a/llvm/test/Transforms/OpenMP/spmdization.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -2450,9 +2450,6 @@ attributes #11 = { convergent }
 ; AMDGPU: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
 ; AMDGPU: [[LOOP28]] = distinct !{!28, !23, !24}
 ; AMDGPU: [[LOOP29]] = distinct !{!29, !23, !24}
-; AMDGPU: [[META30:![0-9]+]] = !{!31, !27, i64 0}
-; AMDGPU: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
-; AMDGPU: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
 ;.
 ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -2484,9 +2481,6 @@ attributes #11 = { convergent }
 ; NVPTX: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
 ; NVPTX: [[LOOP28]] = distinct !{!28, !23, !24}
 ; NVPTX: [[LOOP29]] = distinct !{!29, !23, !24}
-; NVPTX: [[META30:![0-9]+]] = !{!31, !27, i64 0}
-; NVPTX: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
-; NVPTX: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
 ;.
 ; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -2518,9 +2512,6 @@ attributes #11 = { convergent }
 ; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
 ; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24}
 ; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24}
-; AMDGPU-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0}
-; AMDGPU-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
-; AMDGPU-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
 ;.
 ; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
 ; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -2552,7 +2543,4 @@ attributes #11 = { convergent }
 ; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0}
 ; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24}
 ; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24}
-; NVPTX-DISABLED: [[META30:![0-9]+]] = !{!31, !27, i64 0}
-; NVPTX-DISABLED: [[META31:![0-9]+]] = !{!"kmp_task_t_with_privates", !32, i64 0}
-; NVPTX-DISABLED: [[META32:![0-9]+]] = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32}
 ;.

diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
index c65f8d8c8179a..c66d93d542c75 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll
@@ -163,7 +163,7 @@ define weak void @__omp_offloading_2b_10393b5_generic_l20() #0 {
 ; CHECK-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK:       user_code.entry:
-; CHECK-NEXT:    call void @generic_helper() #[[ATTR5]]
+; CHECK-NEXT:    call void @generic_helper() #[[ATTR6:[0-9]+]]
 ; CHECK-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 2)
 ; CHECK-NEXT:    ret void
 ; CHECK:       worker.exit:
@@ -176,7 +176,7 @@ define weak void @__omp_offloading_2b_10393b5_generic_l20() #0 {
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 ; CHECK-DISABLE-SPMDIZATION:       user_code.entry:
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @generic_helper() #[[ATTR5]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @generic_helper() #[[ATTR6:[0-9]+]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @__kmpc_target_deinit(ptr @[[GLOB1]], i8 1)
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ; CHECK-DISABLE-SPMDIZATION:       worker.exit:
@@ -202,7 +202,7 @@ define internal void @spmd_helper() #1 {
 ; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
-; CHECK-NEXT:    call void @leaf() #[[ATTR5]]
+; CHECK-NEXT:    call void @leaf() #[[ATTR6]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) #[[ATTR2:[0-9]+]]
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
 ; CHECK-NEXT:    ret void
@@ -211,7 +211,7 @@ define internal void @spmd_helper() #1 {
 ; CHECK-DISABLE-SPMDIZATION-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-DISABLE-SPMDIZATION-NEXT:  entry:
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR5]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR6]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) #[[ATTR2:[0-9]+]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
@@ -231,7 +231,7 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@__omp_outlined__
@@ -239,7 +239,7 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
 ; CHECK-DISABLE-SPMDIZATION-NEXT:  entry:
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @unknown() #[[ATTR6:[0-9]+]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @unknown() #[[ATTR7:[0-9]+]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ;
 entry:
@@ -333,13 +333,13 @@ define internal void @generic_helper() #1 {
 ; CHECK-LABEL: define {{[^@]+}}@generic_helper
 ; CHECK-SAME: () #[[ATTR4]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @leaf() #[[ATTR5]]
+; CHECK-NEXT:    call void @leaf() #[[ATTR6]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@generic_helper
 ; CHECK-DISABLE-SPMDIZATION-SAME: () #[[ATTR4]] {
 ; CHECK-DISABLE-SPMDIZATION-NEXT:  entry:
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR5]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR6]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ;
 entry:
@@ -376,17 +376,19 @@ attributes #5 = { convergent }
 ; CHECK: attributes #[[ATTR1]] = { convergent noinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
 ; CHECK: attributes #[[ATTR2]] = { nounwind }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { alwaysinline }
-; CHECK: attributes #[[ATTR4]] = { convergent noinline nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR4]] = { convergent noinline nosync nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
 ; CHECK: attributes #[[ATTR5]] = { convergent nounwind }
-; CHECK: attributes #[[ATTR6]] = { convergent }
+; CHECK: attributes #[[ATTR6]] = { convergent nosync nounwind }
+; CHECK: attributes #[[ATTR7]] = { convergent }
 ;.
 ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
 ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR1]] = { convergent noinline nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
 ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR2]] = { nounwind }
 ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR3:[0-9]+]] = { alwaysinline }
-; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR4]] = { convergent noinline nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR4]] = { convergent noinline nosync nounwind memory(write) "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
 ; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR5]] = { convergent nounwind }
-; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR6]] = { convergent }
+; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR6]] = { convergent nosync nounwind }
+; CHECK-DISABLE-SPMDIZATION: attributes #[[ATTR7]] = { convergent }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0}
 ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1}

diff --git a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
index 686d487ff2a94..2234b695307d4 100644
--- a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
+++ b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll
@@ -260,8 +260,8 @@ define internal void @spmd_helper() #1 {
 ; CHECK-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
-; CHECK-NEXT:    call void @leaf() #[[ATTR3:[0-9]+]]
-; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @leaf() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) #[[ATTR3:[0-9]+]]
 ; CHECK-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
 ; CHECK-NEXT:    ret void
 ;
@@ -269,8 +269,8 @@ define internal void @spmd_helper() #1 {
 ; CHECK-DISABLE-SPMDIZATION-SAME: () #[[ATTR1:[0-9]+]] {
 ; CHECK-DISABLE-SPMDIZATION-NEXT:  entry:
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR3:[0-9]+]]
-; CHECK-DISABLE-SPMDIZATION-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) #[[ATTR3]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR7:[0-9]+]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]]) #[[ATTR3:[0-9]+]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__, ptr @__omp_outlined___wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0)
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ;
@@ -289,7 +289,7 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 ; CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    call void @leaf() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT:    call void @leaf() #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@__omp_outlined__
@@ -297,7 +297,7 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %.
 ; CHECK-DISABLE-SPMDIZATION-NEXT:  entry:
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR7:[0-9]+]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR7]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ;
 entry:
@@ -319,7 +319,7 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
 ; CHECK-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
 ; CHECK-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
-; CHECK-NEXT:    call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-NEXT:    call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
@@ -330,7 +330,7 @@ define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) #2 {
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR3]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @__omp_outlined__(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR7]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ;
 entry:
@@ -381,14 +381,14 @@ define internal void @generic_helper() #1 {
 ; CHECK-SAME: () #[[ATTR1]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    call void @unknown()
-; CHECK-NEXT:    call void @leaf() #[[ATTR3]]
+; CHECK-NEXT:    call void @leaf() #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
 ; CHECK-DISABLE-SPMDIZATION-LABEL: define {{[^@]+}}@generic_helper
 ; CHECK-DISABLE-SPMDIZATION-SAME: () #[[ATTR1]] {
 ; CHECK-DISABLE-SPMDIZATION-NEXT:  entry:
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @unknown()
-; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR3]]
+; CHECK-DISABLE-SPMDIZATION-NEXT:    call void @leaf() #[[ATTR7]]
 ; CHECK-DISABLE-SPMDIZATION-NEXT:    ret void
 ;
 entry:

diff --git a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
index 5bc6704ffc67e..008e8a2e565df 100644
--- a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
+++ b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll
@@ -19,53 +19,30 @@ target triple = "amdgcn-amd-amdhsa"
 ; CGSCC: @[[STR:[a-zA-Z0-9_$"\\.-]+]] = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1
 ;.
 define void @kernel() "kernel" {
-; TUNIT: Function Attrs: norecurse
-; TUNIT-LABEL: define {{[^@]+}}@kernel
-; TUNIT-SAME: () #[[ATTR0:[0-9]+]] {
-; TUNIT-NEXT:    [[CALL:%.*]] = call i32 @__kmpc_target_init(ptr undef, i8 1, i1 false)
-; TUNIT-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], -1
-; TUNIT-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; TUNIT:       if.then:
-; TUNIT-NEXT:    store i32 1, ptr addrspace(3) @G, align 4
-; TUNIT-NEXT:    br label [[IF_MERGE:%.*]]
-; TUNIT:       if.else:
-; TUNIT-NEXT:    call void @barrier() #[[ATTR5:[0-9]+]]
-; TUNIT-NEXT:    [[L:%.*]] = load i32, ptr addrspace(3) @G, align 4
-; TUNIT-NEXT:    call void @use1(i32 [[L]]) #[[ATTR5]]
-; TUNIT-NEXT:    br label [[IF_MERGE]]
-; TUNIT:       if.merge:
-; TUNIT-NEXT:    br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]]
-; TUNIT:       if.then2:
-; TUNIT-NEXT:    store i32 2, ptr addrspace(3) @G, align 4
-; TUNIT-NEXT:    call void @barrier() #[[ATTR5]]
-; TUNIT-NEXT:    br label [[IF_END]]
-; TUNIT:       if.end:
-; TUNIT-NEXT:    call void @__kmpc_target_deinit(ptr undef, i8 1)
-; TUNIT-NEXT:    ret void
 ;
-; CGSCC: Function Attrs: norecurse
-; CGSCC-LABEL: define {{[^@]+}}@kernel
-; CGSCC-SAME: () #[[ATTR0:[0-9]+]] {
-; CGSCC-NEXT:    [[CALL:%.*]] = call i32 @__kmpc_target_init(ptr undef, i8 1, i1 false)
-; CGSCC-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], -1
-; CGSCC-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CGSCC:       if.then:
-; CGSCC-NEXT:    store i32 1, ptr addrspace(3) @G, align 4
-; CGSCC-NEXT:    br label [[IF_MERGE:%.*]]
-; CGSCC:       if.else:
-; CGSCC-NEXT:    call void @barrier()
-; CGSCC-NEXT:    [[L:%.*]] = load i32, ptr addrspace(3) @G, align 4
-; CGSCC-NEXT:    call void @use1(i32 [[L]])
-; CGSCC-NEXT:    br label [[IF_MERGE]]
-; CGSCC:       if.merge:
-; CGSCC-NEXT:    br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]]
-; CGSCC:       if.then2:
-; CGSCC-NEXT:    store i32 2, ptr addrspace(3) @G, align 4
-; CGSCC-NEXT:    call void @barrier()
-; CGSCC-NEXT:    br label [[IF_END]]
-; CGSCC:       if.end:
-; CGSCC-NEXT:    call void @__kmpc_target_deinit(ptr undef, i8 1)
-; CGSCC-NEXT:    ret void
+; CHECK: Function Attrs: norecurse
+; CHECK-LABEL: define {{[^@]+}}@kernel
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @__kmpc_target_init(ptr undef, i8 1, i1 false)
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[CALL]], -1
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 1, ptr addrspace(3) @G, align 4
+; CHECK-NEXT:    br label [[IF_MERGE:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    call void @barrier() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT:    [[L:%.*]] = load i32, ptr addrspace(3) @G, align 4
+; CHECK-NEXT:    call void @use1(i32 [[L]]) #[[ATTR5]]
+; CHECK-NEXT:    br label [[IF_MERGE]]
+; CHECK:       if.merge:
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN2:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then2:
+; CHECK-NEXT:    store i32 2, ptr addrspace(3) @G, align 4
+; CHECK-NEXT:    call void @barrier() #[[ATTR5]]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    call void @__kmpc_target_deinit(ptr undef, i8 1)
+; CHECK-NEXT:    ret void
 ;
   %call = call i32 @__kmpc_target_init(ptr undef, i8 1, i1 false)
   %cmp = icmp eq i32 %call, -1
@@ -112,20 +89,17 @@ declare void @llvm.assume(i1)
 !2 = !{ptr @kernel, !"kernel", i32 1}
 
 ;.
-; TUNIT: attributes #[[ATTR0]] = { norecurse "kernel" }
-; TUNIT: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind }
-; TUNIT: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind }
-; TUNIT: attributes #[[ATTR3:[0-9]+]] = { nocallback }
-; TUNIT: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
-; TUNIT: attributes #[[ATTR5]] = { nounwind }
-;.
-; CGSCC: attributes #[[ATTR0]] = { norecurse "kernel" }
-; CGSCC: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind }
-; CGSCC: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind }
-; CGSCC: attributes #[[ATTR3:[0-9]+]] = { nocallback }
-; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+; CHECK: attributes #[[ATTR0]] = { norecurse "kernel" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback norecurse nounwind }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback norecurse nosync nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback }
+; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) }
+; CHECK: attributes #[[ATTR5]] = { nounwind }
 ;.
 ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
 ; CHECK: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
 ;.
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CGSCC: {{.*}}
+; TUNIT: {{.*}}
