[Mlir-commits] [clang] [llvm] [mlir] [OpenMP] Migrate GPU Reductions CodeGen from Clang to OMPIRBuilder (PR #80343)

Wed Feb 14 08:11:54 PST 2024

https://github.com/TIFitis updated https://github.com/llvm/llvm-project/pull/80343

>From f07916c9ab9629a77201377ac16e543f56fabab3 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg <jan_sjodin at yahoo.com>
Date: Mon, 15 Jan 2024 11:34:09 +0000
Subject: [PATCH 01/10] Jan migration work

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  109 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 1243 ++++++++++++++++-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   11 +
 3 files changed, 1315 insertions(+), 48 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2288969ecc95c4..e940d9f655c869 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/Support/Allocator.h"
@@ -99,14 +100,22 @@ class OpenMPIRBuilderConfig {
   /// expanded.
   std::optional<bool> IsGPU;
 
-  // Flag for specifying if offloading is mandatory.
+  /// Flag for specifying if offloading is mandatory.
   std::optional<bool> OpenMPOffloadMandatory;
 
+  /// Name of the target processor.
+  StringRef TargetCPU;
+  /// String representation of the target processor's features.
+  StringRef TargetFeatures;
+
   /// First separator used between the initial two parts of a name.
   std::optional<StringRef> FirstSeparator;
   /// Separator used between all of the rest consecutive parts of s name
   std::optional<StringRef> Separator;
 
+  // Grid Value for the GPU target
+  std::optional<omp::GV> GridValue;
+
   OpenMPIRBuilderConfig();
   OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                         bool OpenMPOffloadMandatory,
@@ -132,6 +141,11 @@ class OpenMPIRBuilderConfig {
     return *OpenMPOffloadMandatory;
   }
 
+  omp::GV getGridValue() const {
+    assert(GridValue.has_value() && "GridValue is not set");
+    return *GridValue;
+  }
+
   bool hasRequiresFlags() const { return RequiresFlags; }
   bool hasRequiresReverseOffload() const;
   bool hasRequiresUnifiedAddress() const;
@@ -167,6 +181,7 @@ class OpenMPIRBuilderConfig {
   void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
   void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
   void setSeparator(StringRef S) { Separator = S; }
+  void setGridValue(omp::GV G) { GridValue = G; }
 
   void setHasRequiresReverseOffload(bool Value);
   void setHasRequiresUnifiedAddress(bool Value);
@@ -1233,18 +1248,26 @@ class OpenMPIRBuilder {
   getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                            StringRef ParentName = "");
 
+  // using ReductionGenTy =
+  //     function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+
+  // using AtomicReductionGenTy =
+  //     function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+
+  /// Owning equivalents of OpenMPIRBuilder::(Atomic)ReductionGen that are used
+  /// to
+  /// store lambdas with capture.
   /// Functions used to generate reductions. Such functions take two Values
   /// representing LHS and RHS of the reduction, respectively, and a reference
   /// to the value that is updated to refer to the reduction result.
-  using ReductionGenTy =
-      function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
-
+  using ReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
+      OpenMPIRBuilder::InsertPointTy, Value *, Value *, Value *&)>;
   /// Functions used to generate atomic reductions. Such functions take two
   /// Values representing pointers to LHS and RHS of the reduction, as well as
   /// the element type of these pointers. They are expected to atomically
   /// update the LHS to the reduced value.
-  using AtomicReductionGenTy =
-      function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+  using AtomicReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
+      OpenMPIRBuilder::InsertPointTy, Type *, Value *, Value *)>;
 
   /// Information about an OpenMP reduction.
   struct ReductionInfo {
@@ -1254,6 +1277,10 @@ class OpenMPIRBuilder {
         : ElementType(ElementType), Variable(Variable),
           PrivateVariable(PrivateVariable), ReductionGen(ReductionGen),
           AtomicReductionGen(AtomicReductionGen) {}
+    ReductionInfo(Value *PrivateVariable)
+        : ElementType(nullptr), Variable(nullptr),
+          PrivateVariable(PrivateVariable), ReductionGen(),
+          AtomicReductionGen() {}
 
     /// Reduction element type, must match pointee type of variable.
     Type *ElementType;
@@ -1276,6 +1303,69 @@ class OpenMPIRBuilder {
     AtomicReductionGenTy AtomicReductionGen;
   };
 
+  /// A class that manages the reduction info to facilitate lowering of
+  /// reductions at multiple levels of parallelism. For example handling teams
+  /// and parallel reductions on GPUs
+
+  class ReductionInfoManager {
+  private:
+    SmallVector<ReductionInfo> ReductionInfos;
+    std::optional<InsertPointTy> PrivateVarAllocaIP;
+
+  public:
+    ReductionInfoManager(){};
+    void clear() {
+      ReductionInfos.clear();
+      PrivateVarAllocaIP.reset();
+    }
+
+    Value *
+    allocatePrivateReductionVar(IRBuilderBase &builder,
+                                llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                                Type *VarType) {
+      llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
+      llvm::Value *var = builder.CreateAlloca(VarType);
+      var->setName("private_redvar");
+      llvm::Value *castVar =
+          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      ReductionInfos.push_back(ReductionInfo(castVar));
+      return castVar;
+    }
+
+    ReductionInfo getReductionInfo(unsigned Index) {
+      return ReductionInfos[Index];
+    }
+    ReductionInfo setReductionInfo(unsigned Index, ReductionInfo &RI) {
+      return ReductionInfos[Index] = RI;
+    }
+    Value *getPrivateReductionVariable(unsigned Index) {
+      return ReductionInfos[Index].PrivateVariable;
+    }
+    SmallVector<ReductionInfo> &getReductionInfos() { return ReductionInfos; }
+
+    bool hasPrivateVarAllocaIP() { return PrivateVarAllocaIP.has_value(); }
+    InsertPointTy getPrivateVarAllocaIP() {
+      assert(PrivateVarAllocaIP.has_value() && "AllocaIP not set");
+      return *PrivateVarAllocaIP;
+    }
+    void setPrivateVarAllocaIP(InsertPointTy IP) { PrivateVarAllocaIP = IP; }
+  };
+
+  /// \param Loc                The location where the reduction was
+  ///                           encountered. Must be within the associate
+  ///                           directive and after the last local access to the
+  ///                           reduction variables.
+  /// \param AllocaIP           An insertion point suitable for allocas usable
+  ///                           in reductions.
+  /// \param ReductionInfos     A list of info on each reduction variable.
+  /// \param IsNoWait           A flag set if the reduction is marked as nowait.
+  InsertPointTy createReductionsGPU(const LocationDescription &Loc,
+                                    InsertPointTy AllocaIP,
+                                    ArrayRef<ReductionInfo> ReductionInfos,
+                                    bool IsNoWait = false,
+                                    bool IsTeamsReduction = false,
+                                    bool HasDistribute = false);
+
   // TODO: provide atomic and non-atomic reduction generators for reduction
   // operators defined by the OpenMP specification.
 
@@ -1340,7 +1430,9 @@ class OpenMPIRBuilder {
   InsertPointTy createReductions(const LocationDescription &Loc,
                                  InsertPointTy AllocaIP,
                                  ArrayRef<ReductionInfo> ReductionInfos,
-                                 bool IsNoWait = false);
+                                 bool IsNoWait = false,
+                                 bool IsTeamsReduction = false,
+                                 bool HasDistribute = false);
 
   ///}
 
@@ -1481,6 +1573,9 @@ class OpenMPIRBuilder {
   /// Info manager to keep track of target regions.
   OffloadEntriesInfoManager OffloadInfoManager;
 
+  /// Info manager to keep track of reduction information;
+  ReductionInfoManager RIManager;
+
   /// The target triple of the underlying module.
   const Triple T;
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 02b333e9ccd567..b220aab6700e20 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -145,10 +145,19 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
 }
 #endif
 
-static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
+Function *GLOBAL_ReductionFunc = nullptr;
+
+static uint64_t getTypeSizeInBytes(Module &M, Type *Type) {
+  return divideCeil(M.getDataLayout().getTypeSizeInBits(Type), 8);
+}
+
+static Value *getTypeSizeInBytesValue(IRBuilder<> &Builder, Module &M,
+                                      Type *Type) {
+  return Builder.getInt64(getTypeSizeInBytes(M, Type));
+}
+
+static const omp::GV &getGridValue(const Triple &T, StringRef Features) {
   if (T.isAMDGPU()) {
-    StringRef Features =
-        Kernel->getFnAttribute("target-features").getValueAsString();
     if (Features.count("+wavefrontsize64"))
       return omp::getAMDGPUGridValues<64>();
     return omp::getAMDGPUGridValues<32>();
@@ -158,6 +167,17 @@ static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
   llvm_unreachable("No grid value available for this architecture!");
 }
 
+static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
+  if (T.isAMDGPU()) {
+    StringRef Features =
+        Kernel->getFnAttribute("target-features").getValueAsString();
+    return getGridValue(T, Features);
+  }
+  if (T.isNVPTX())
+    return omp::NVPTXGridValues;
+  llvm_unreachable("No grid value available for this architecture!");
+}
+
 /// Determine which scheduling algorithm to use, determined from schedule clause
 /// arguments.
 static OMPScheduleType
@@ -2051,36 +2071,1182 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
                               /*IsCancellable*/ true);
 }
 
+static Value *getGPUWarpSize(Module &M, OpenMPIRBuilder &OMPBuilder) {
+  return OMPBuilder.Builder.CreateCall(
+      OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size),
+      {});
+}
+
+static Value *getGPUThreadID(Module &M, OpenMPIRBuilder &OMPBuilder) {
+  return OMPBuilder.Builder.CreateCall(
+      OMPBuilder.getOrCreateRuntimeFunction(
+          M, OMPRTL___kmpc_get_hardware_thread_id_in_block),
+      {});
+}
+
+static Value *getGPUNumThreads(Module &M, OpenMPIRBuilder &OMPBuilder) {
+  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
+  llvm::Function *F = M.getFunction(LocSize);
+  if (!F) {
+    LLVMContext &Ctx = M.getContext();
+    Type *I32Type = Type::getInt32Ty(Ctx);
+
+    F = Function::Create(FunctionType::get(I32Type, std::nullopt, false),
+                         GlobalVariable::ExternalLinkage, LocSize, M);
+  }
+  return OMPBuilder.Builder.CreateCall(F, std::nullopt, "nvptx_num_threads");
+}
+
+static Value *getNVPTXWarpID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
+  unsigned LaneIDBits =
+      llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
+  return OMPIRBuilder.Builder.CreateAShr(getGPUThreadID(M, OMPIRBuilder),
+                                         LaneIDBits, "nvptx_warp_id");
+}
+
+static Value *getNVPTXLaneID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
+   unsigned LaneIDBits =
+     llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
+  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
+  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
+  return OMPIRBuilder.Builder.CreateAnd(
+      getGPUThreadID(M, OMPIRBuilder),
+      OMPIRBuilder.Builder.getInt32(LaneIDMask), "nvptx_lane_id");
+}
+
+namespace {
+enum CopyAction : unsigned {
+  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+  // the warp using shuffle instructions.
+  RemoteLaneToThread,
+  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+  ThreadCopy,
+};
+} // namespace
+
+struct CopyOptionsTy {
+  llvm::Value *RemoteLaneOffset;
+  llvm::Value *ScratchpadIndex;
+  llvm::Value *ScratchpadWidth;
+};
+
+static Value *castValueToType(Module &M, OpenMPIRBuilder &OMPBuilder,
+                              Value *From, Type *ToType,
+                              OpenMPIRBuilder::InsertPointTy AllocaIP,
+                              const OpenMPIRBuilder::LocationDescription &Loc) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  Type *FromType = From->getType();
+  uint64_t FromSize =
+      divideCeil(M.getDataLayout().getTypeSizeInBits(FromType), 8);
+  uint64_t ToSize = divideCeil(M.getDataLayout().getTypeSizeInBits(ToType), 8);
+  assert(FromSize > 0 && "From size must be greater than zero");
+  assert(ToSize > 0 && "From size must be greater than zero");
+  if (FromType == ToType)
+    return From;
+  if (FromSize == ToSize)
+    return Builder.CreateBitCast(From, ToType);
+  if (ToType->isIntegerTy() && FromType->isIntegerTy())
+    // FIXME(JAN): Assuming signed integer here, not sure how to find out
+    // if unsigned
+    return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
+  OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+  Builder.restoreIP(AllocaIP);
+  Value *CastItem = Builder.CreateAlloca(ToType, nullptr, "cast_tmp");
+  Builder.restoreIP(CurIP);
+
+  Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      CastItem, FromType->getPointerTo(), "valcastitem");
+  Builder.CreateStore(From, ValCastItem);
+  return Builder.CreateLoad(ToType, CastItem, "castitemload");
+}
+
+static Value *
+createRuntimeShuffleFunction(Module &M, OpenMPIRBuilder &OMPBuilder,
+                             const OpenMPIRBuilder::LocationDescription &Loc,
+                             OpenMPIRBuilder::InsertPointTy AllocaIP,
+                             Value *Element, Type *ElementType, Value *Offset) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  uint64_t Size =
+      divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
+  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
+  Function *ShuffleFunc = OMPBuilder.getOrCreateRuntimeFunctionPtr(
+      Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
+                : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
+  Type *IntType = Builder.getIntNTy(Size <= 4 ? 32 : 64);
+  Value *ElemCast = Builder.CreateCast(Instruction::SExt, Element, IntType);
+  Value *WarpSize = getGPUWarpSize(M, OMPBuilder);
+  Value *WarpSizeCast =
+      Builder.CreateIntCast(WarpSize, Type::getInt16Ty(Ctx), /*isSigned=*/true);
+  Value *ShuffleCall =
+      Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
+  return castValueToType(M, OMPBuilder, ShuffleCall, IntType, AllocaIP, Loc);
+}
+
+static void shuffleAndStore(Value *SrcAddr, Value *DstAddr, Type *ElementType,
+                            llvm::Value *Offset, Type *ReductionArrayTy,
+                            const OpenMPIRBuilder::LocationDescription &Loc,
+                            Module &M, OpenMPIRBuilder &OMPBuilder,
+                            OpenMPIRBuilder::InsertPointTy AllocaIP) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  uint64_t Size =
+      divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Value *ElemPtr = DstAddr;
+  Value *Ptr = SrcAddr;
+  //  Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
+  // Builder.CreateConstGEP1_64(ReductionArrayTy, SrcAddr, 1), PtrTy);
+  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
+    if (Size < IntSize)
+      continue;
+    // FIXME(JAN): Check if there is a function to convert from bytes to bits
+    Type *IntTy = Builder.getIntNTy(IntSize * 8);
+    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        Ptr, IntTy->getPointerTo(), "ptrcast");
+    ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        ElemPtr, IntTy->getPointerTo(), "elemptrcast");
+
+    // FIXME(JAN): Implement loop to handle larger size
+    assert(((Size / IntSize) <= 1) && "Unsupported IntSize");
+    Value *Val = Builder.CreateLoad(IntTy, Ptr);
+    Value *Res = createRuntimeShuffleFunction(M, OMPBuilder, Loc, AllocaIP, Val,
+                                              IntTy, Offset);
+    Builder.CreateStore(Res, ElemPtr);
+    Ptr = Builder.CreateConstGEP1_64(ReductionArrayTy, Ptr, 1, "ptrgep");
+    ElemPtr =
+        Builder.CreateConstGEP1_64(ReductionArrayTy, ElemPtr, 1, "elemptrgep");
+    Size = Size % IntSize;
+  }
+}
+
+static void
+emitReductionListCopy(CopyAction Action, Type *ReductionArrayTy,
+                      ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+                      Value *SrcBase, Value *DestBase, Module &M,
+                      OpenMPIRBuilder &OMPBuilder,
+                      const OpenMPIRBuilder::LocationDescription &Loc,
+                      OpenMPIRBuilder::InsertPointTy AllocaIP,
+                      CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
+  LLVMContext &Ctx = M.getContext();
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+
+  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
+
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *SrcElementAddr = nullptr;
+    Value *DestElementAddr = nullptr;
+    Value *DestElementPtrAddr = nullptr;
+    bool ShuffleInElement = false;
+    bool UpdateDestListPtr = false;
+
+    // Step 1.1: Get the address for the src element in the Reduce list.
+    Value *SrcElementPtrAddr = Builder.CreateConstGEP2_64(
+        ReductionArrayTy, SrcBase, 0, En.index(), "srcelementptraddr");
+    SrcElementAddr =
+        Builder.CreateLoad(PtrTy, SrcElementPtrAddr, "srcelementaddr");
+
+    // Step 1.2: Create a temporary to store the element in the destination
+    // Reduce list.
+    DestElementPtrAddr = Builder.CreateInBoundsGEP(
+        ReductionArrayTy, DestBase,
+        {Builder.getInt64(0), Builder.getInt64(En.index())},
+        "destelementptraddr");
+    switch (Action) {
+    case RemoteLaneToThread: {
+      OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+      Builder.restoreIP(AllocaIP);
+      DestElementAddr = Builder.CreateAlloca(RI.ElementType, nullptr,
+                                             ".omp.reduction.element");
+      Builder.restoreIP(CurIP);
+      ShuffleInElement = true;
+      UpdateDestListPtr = true;
+      break;
+    }
+    case ThreadCopy: {
+      DestElementAddr =
+          Builder.CreateLoad(PtrTy, DestElementPtrAddr, "destelementaddr");
+      break;
+    }
+    }
+
+    // FIXME(JAN): Original code in clanguses <Addr>.withElementType(...)
+    // check if this generates any code
+
+    if (ShuffleInElement) {
+      shuffleAndStore(SrcElementAddr, DestElementAddr, RI.ElementType,
+                      RemoteLaneOffset, ReductionArrayTy, Loc, M, OMPBuilder,
+                      AllocaIP);
+    } else {
+      // FIXME(JAN): Assume Scalar here (TEK_Scalar in Clang)
+      Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
+      Builder.CreateStore(Elem, DestElementAddr);
+    }
+    // Step 3.1: Modify reference in dest Reduce list as needed.
+    // Modifying the reference in Reduce list to point to the newly
+    // created element.  The element is live in the current function
+    // scope and that of functions it invokes (i.e., reduce_function).
+    // RemoteReduceData[i] = (void*)&RemoteElem
+    if (UpdateDestListPtr) {
+      Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          DestElementAddr, PtrTy, "castdestaddr");
+      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
+    }
+  }
+}
+
+static OpenMPIRBuilder::InsertPointTy getIPAfterInstr(Instruction *I) {
+  BasicBlock::iterator it(I);
+  it++;
+  return OpenMPIRBuilder::InsertPointTy(I->getParent(), it);
+}
+
+static Function *emitShuffleAndReduceFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Type *I16Type = Type::getInt16Ty(Ctx);
+  auto FuncTy = FunctionType::get(VoidTy, {PtrTy, I16Type, I16Type, I16Type},
+                                  /* IsVarArg */ false);
+  Function *SarFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_shuffle_and_reduce_func", &M);
+  SarFunc->setDoesNotRecurse();
+
+  // Set arg names
+  Argument *Arg0 = SarFunc->getArg(0);
+  Argument *Arg1 = SarFunc->getArg(1);
+  Argument *Arg2 = SarFunc->getArg(2);
+  Argument *Arg3 = SarFunc->getArg(3);
+  Arg0->setName("reduce_list_arg");
+  Arg1->setName("lane_id_arg");
+  Arg2->setName("remote_lane_offset_arg");
+  Arg3->setName("algo_ver_arg");
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", SarFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  Type *Arg0Type = Arg0->getType();
+  Type *ArgNType = Arg1->getType();
+  Type *ArgNPtrType = Arg1->getType()->getPointerTo();
+  Value *ReduceListAlloca =
+      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+  Value *LaneIdAlloca =
+      Builder.CreateAlloca(ArgNType, nullptr, Arg1->getName() + ".addr");
+  Value *RemoteLaneOffsetAlloca =
+      Builder.CreateAlloca(ArgNType, nullptr, Arg2->getName() + ".addr");
+  Value *AlgoVerAlloca =
+      Builder.CreateAlloca(ArgNType, nullptr, Arg3->getName() + ".addr");
+  // FIXME(Jan): Compute reduction list array type
+  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
+      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
+
+  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
+  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LaneIdAlloca, ArgNPtrType, LaneIdAlloca->getName() + ".acast");
+  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteLaneOffsetAlloca, ArgNPtrType,
+      RemoteLaneOffsetAlloca->getName() + ".acast");
+  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      AlgoVerAlloca, ArgNPtrType, AlgoVerAlloca->getName() + ".acast");
+  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteReductionListAlloca, PtrTy,
+      RemoteReductionListAlloca->getName() + ".acast");
+
+  Builder.CreateStore(Arg0, ReduceListAddrCast);
+  Builder.CreateStore(Arg1, LaneIdAddrCast);
+  Builder.CreateStore(Arg2, RemoteLaneOffsetAddrCast);
+  Builder.CreateStore(Arg3, AlgoVerAddrCast);
+
+  Value *ReduceList =
+      Builder.CreateLoad(Arg0Type, ReduceListAddrCast, "reduce_list");
+  Value *LaneId = Builder.CreateLoad(ArgNType, LaneIdAddrCast, "lane_id");
+  Value *RemoteLaneOffset = Builder.CreateLoad(
+      ArgNType, RemoteLaneOffsetAddrCast, "remote_lane_offset");
+  Value *AlgoVer = Builder.CreateLoad(ArgNType, AlgoVerAddrCast, "algo_ver");
+
+  OpenMPIRBuilder::InsertPointTy AllocaIP =
+      getIPAfterInstr(RemoteReductionListAlloca);
+  emitReductionListCopy(RemoteLaneToThread, RedListArrayTy, ReductionInfos,
+                        ReduceList, RemoteListAddrCast, M, OMPBuilder, Loc,
+                        AllocaIP, {RemoteLaneOffset, nullptr, nullptr});
+
+  // The actions to be performed on the Remote Reduce list is dependent
+  // on the algorithm version.
+  //
+  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
+  //  LaneId % 2 == 0 && Offset > 0):
+  //    do the reduction value aggregation
+  //
+  //  The thread local variable Reduce list is mutated in place to host the
+  //  reduced data, which is the aggregated value produced from local and
+  //  remote lanes.
+  //
+  //  Note that AlgoVer is expected to be a constant integer known at compile
+  //  time.
+  //  When AlgoVer==0, the first conjunction evaluates to true, making
+  //    the entire predicate true during compile time.
+  //  When AlgoVer==1, the second conjunction has only the second part to be
+  //    evaluated during runtime.  Other conjunctions evaluates to false
+  //    during compile time.
+  //  When AlgoVer==2, the third conjunction has only the second part to be
+  //    evaluated during runtime.  Other conjunctions evaluates to false
+  //    during compile time.
+  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
+  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
+  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
+  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
+  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
+  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
+  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
+  Value *RemoteOffsetComp =
+      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
+  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
+  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
+  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
+
+  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", SarFunc);
+  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", SarFunc);
+  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", SarFunc);
+
+  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
+  Builder.SetInsertPoint(ThenBB);
+  // reduce_function(LocalReduceList, RemoteReduceList)
+  Value *LocalReduceListPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(ReduceList, PtrTy);
+  Value *RemoteReduceListPtr =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(RemoteListAddrCast, PtrTy);
+  Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
+  Builder.CreateBr(MergeBB);
+  Builder.SetInsertPoint(ElseBB);
+  Builder.CreateBr(MergeBB);
+  Builder.SetInsertPoint(MergeBB);
+
+  Value *Algo1_2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
+  Value *CondCopy = Builder.CreateAnd(Algo1_2, LaneIdGtOffset);
+
+  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "cpy_then", SarFunc);
+  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "cpy_else", SarFunc);
+  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "cpy_ifcont", SarFunc);
+
+  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
+
+  Builder.SetInsertPoint(CpyThenBB);
+  emitReductionListCopy(ThreadCopy, RedListArrayTy, ReductionInfos,
+                        RemoteListAddrCast, ReduceList, M, OMPBuilder, Loc,
+                        AllocaIP);
+  Builder.CreateBr(CpyMergeBB);
+  Builder.SetInsertPoint(CpyElseBB);
+  Builder.CreateBr(CpyMergeBB);
+  Builder.SetInsertPoint(CpyMergeBB);
+  Builder.CreateRetVoid();
+
+  return SarFunc;
+}
+
+static Function *emitInterWarpCopyFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Type *I32Type = Type::getInt32Ty(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, I32Type}, /* IsVarArg */ false);
+  Function *WcFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_inter_warp_copy_func", &M);
+  WcFunc->setDoesNotRecurse();
+
+  // Set arg names
+  Argument *Arg0 = WcFunc->getArg(0);
+  Argument *Arg1 = WcFunc->getArg(1);
+  Arg0->setName("reduce_list");
+  Arg1->setName("num_warps");
+
+  // Ensure data transfer storage
+  unsigned WarpSize = OMPBuilder.Config.getGridValue().GV_Warp_Size;
+  // FIXME(Jan): Not sure about the array type here, but it is I32 in Clang
+  auto *ArrayTy = ArrayType::get(I32Type, WarpSize);
+  StringRef TransferMediumName =
+      "__openmp_nvptx_data_transfer_temporary_storage";
+  GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
+  if (!TransferMedium) {
+    unsigned SharedAddressSpace =
+        3; /* FIXME(Jan): C.getTargetAddressSpace(LangAS::cuda_shared); */
+    TransferMedium = new GlobalVariable(
+        M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
+        UndefValue::get(ArrayTy), TransferMediumName,
+        /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
+        SharedAddressSpace);
+  }
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", WcFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  Type *Arg0Type = Arg0->getType();
+  Type *Arg1Type = Arg1->getType();
+  Value *ReduceListAlloca =
+      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+  Instruction *NumWarpsAlloca =
+      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
+  Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      NumWarpsAlloca, Arg1Type->getPointerTo(),
+      NumWarpsAlloca->getName() + ".acast");
+  Builder.CreateStore(Arg0, ReduceListAddrCast);
+  Builder.CreateStore(Arg1, NumWarpsAddrCast);
+
+  // Get GPU Info
+  Value *ThreadID = getGPUThreadID(M, OMPBuilder);
+  Value *LaneID = getNVPTXLaneID(M, OMPBuilder);
+  Value *WarpID = getNVPTXWarpID(M, OMPBuilder);
+
+  Value *ReduceListArg =
+      Builder.CreateLoad(PtrTy, ReduceListAddrCast, "reduce_list_arg");
+
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Type *ElementTy = RI.ElementType;
+    unsigned NumTypeBits = M.getDataLayout().getTypeSizeInBits(ElementTy);
+    unsigned RealTySize = divideCeil(NumTypeBits, 8);
+    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+      unsigned NumIters = RealTySize / TySize;
+      if (NumIters == 0)
+        continue;
+      //      Type *CopyTy = Builder.getIntNTy(TySize);
+      Type *Int32Ty = Builder.getInt32Ty();
+      Value *Cnt = nullptr;
+      Value *CntAddrAcast = nullptr;
+      BasicBlock *PrecondBB = nullptr;
+      BasicBlock *ExitBB = nullptr;
+
+      if (NumIters > 1) {
+        OpenMPIRBuilder::InsertPointTy CurrIP = Builder.saveIP();
+        Builder.SetInsertPoint(NumWarpsAlloca);
+        Value *CntAddr = Builder.CreateAlloca(Int32Ty, nullptr, ".cnt.addr");
+        Builder.restoreIP(CurrIP);
+        CntAddrAcast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+            CntAddr, PtrTy, CntAddr->getName() + ".acast");
+        Builder.CreateStore(Constant::getNullValue(Int32Ty), CntAddrAcast);
+        PrecondBB = BasicBlock::Create(Ctx, "precond", WcFunc);
+        ExitBB = BasicBlock::Create(Ctx, "exit", WcFunc);
+        BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body", WcFunc);
+        Builder.CreateBr(PrecondBB);
+        Builder.SetInsertPoint(PrecondBB);
+        Cnt = Builder.CreateLoad(Int32Ty, CntAddrAcast, "cnt");
+        Value *Cmp = Builder.CreateICmpULT(Cnt, Builder.getInt32(NumIters));
+        Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
+        Builder.SetInsertPoint(BodyBB);
+      }
+
+      OMPBuilder.createBarrier(
+          OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
+          omp::Directive::OMPD_unknown,
+          /* ForceSimpleCall */ false,
+          /* CheckCancelFlag */ true);
+      BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", WcFunc);
+      BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", WcFunc);
+      BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", WcFunc);
+
+      // if (lane_id  == 0)
+      Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
+      Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+
+      // then
+      // Reduce element = LocalReduceList[i]
+      Builder.SetInsertPoint(ThenBB);
+      // FIXME(JAN): Should array type be passed in?
+      auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+      // FIXME(JAN): maybe it should be 0,0 and not use En.index()
+      Value *ReduceListElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+          RedListArrayTy, ReduceListArg, 0, En.index());
+      Value *ReduceListElementPtr = Builder.CreateLoad(
+          PtrTy, ReduceListElementPtrPtr, "reduce_list_element_ptr");
+      if (NumIters > 1)
+        ReduceListElementPtr =
+            Builder.CreateGEP(Int32Ty, ReduceListElementPtr, Cnt);
+
+      Value *TransferElemAddr = Builder.CreateInBoundsGEP(
+          ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
+      Value *ReduceListElement = Builder.CreateLoad(
+          I32Type, ReduceListElementPtr, "reduce_list_element");
+      Builder.CreateStore(ReduceListElement, TransferElemAddr,
+                          /*IsVolatile*/ true);
+      Builder.CreateBr(MergeBB);
+
+      // else
+      Builder.SetInsertPoint(ElseBB);
+      Builder.CreateBr(MergeBB);
+
+      // endif
+      Builder.SetInsertPoint(MergeBB);
+      OMPBuilder.createBarrier(
+          OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
+          omp::Directive::OMPD_unknown,
+          /* ForceSimpleCall */ false,
+          /* CheckCancelFlag */ true);
+
+      // Warp 0 copies reduce element from transfer medium
+      BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "w0then", WcFunc);
+      BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "w0else", WcFunc);
+      BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "w0ifcont", WcFunc);
+
+      Value *NumWarpsVal =
+          Builder.CreateLoad(I32Type, NumWarpsAddrCast, "num_warps");
+      Value *IsActiveThread =
+          Builder.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+      Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
+
+      // W0then
+      // SecMEdiumPtr = &medium[tid]
+      Builder.SetInsertPoint(W0ThenBB);
+      Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
+          ArrayTy, TransferMedium, {Builder.getInt64(0), ThreadID});
+      // SrcMediumVal = *SrcMediumPtr
+      // TODO(JAN): Bitcast here, but no load? skipping for now
+      Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+          RedListArrayTy, ReduceListArg, 0, En.index());
+      Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+      if (NumIters > 1)
+        TargetElementPtr = Builder.CreateGEP(Int32Ty, TargetElementPtr, Cnt);
+
+      Value *SrcMediumValue =
+          Builder.CreateLoad(I32Type, SrcMediumPtrVal, /*IsVolatile*/ true);
+      Builder.CreateStore(SrcMediumValue, TargetElementPtr);
+      Builder.CreateBr(W0MergeBB);
+
+      // W0else
+      Builder.SetInsertPoint(W0ElseBB);
+      Builder.CreateBr(W0MergeBB);
+
+      // W0endif
+      Builder.SetInsertPoint(W0MergeBB);
+      if (NumIters > 1) {
+        Cnt = Builder.CreateNSWAdd(Cnt, Builder.getInt32(1));
+        Builder.CreateStore(Cnt, CntAddrAcast);
+        Builder.CreateBr(PrecondBB);
+        Builder.SetInsertPoint(ExitBB);
+      }
+    }
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return WcFunc;
+}
+
+/// This function emits a helper that copies all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+///   For all data entries D in reduce_data:
+///     Copy local D to buffer.D[Idx]
+static Function *emitListToGlobalCopyFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGCFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_list_to_global_copy_func", &M);
+  LtGCFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGCFunc->getArg(0);
+  Argument *Arg1 = LtGCFunc->getArg(1);
+  Argument *Arg2 = LtGCFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {
+      Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
+  Value *ReduceListArg =
+      Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
+  // FIXME(Jan): Assume TEK_SCALAR
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    // FIXME(Jan): Compute array type
+    auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, ReduceListArg, 0, En.index());
+    Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    Value *TargetElement = Builder.CreateLoad(RI.ElementType, TargetElementPtr);
+    Builder.CreateStore(TargetElement, GlobValPtr);
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGCFunc;
+}
+
+/// This function emits a helper that copies all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+///   For all data entries D in reduce_data:
+///     Copy local D to buffer.D[Idx]
+static Function *emitGlobalToListCopyFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGCFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_global_to_list_copy_func", &M);
+  LtGCFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGCFunc->getArg(0);
+  Argument *Arg1 = LtGCFunc->getArg(1);
+  Argument *Arg2 = LtGCFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {
+      Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
+  Value *ReduceListArg =
+      Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
+  // FIXME(Jan): Assume TEK_SCALAR
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    // FIXME(Jan): Compute array type
+    auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, ReduceListArg, 0, En.index());
+    Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
+    Builder.CreateStore(TargetElement, TargetElementPtr);
+  }
+
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGCFunc;
+}
+
+/// This function emits a helper that reduces all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+///  void *GlobPtrs[];
+///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+///  ...
+///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+///  reduce_function(GlobPtrs, reduce_data);
+/// Create a function with a unique name and a "void (i8*, i8*)" signature in
+/// the given module and return it.
+static Function *emitListToGlobalReduceFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGRFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_list_to_global_reduce_func", &M);
+  LtGRFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGRFunc->getArg(0);
+  Argument *Arg1 = LtGRFunc->getArg(1);
+  Argument *Arg2 = LtGRFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  // FIXME(Jan): Compute array type
+  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
+  // FIXME(Jan): Assume TEK_SCALAR
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+  }
+
+  Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList});
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGRFunc;
+}
+
+/// This function emits a helper that reduces all the reduction variables from
+/// the team into the provided global buffer for the reduction variables.
+///
+/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+///  void *GlobPtrs[];
+///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+///  ...
+///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+///  reduce_function(GlobPtrs, reduce_data);
 /// Create a function with a unique name and a "void (i8*, i8*)" signature in
 /// the given module and return it.
-Function *getFreshReductionFunc(Module &M) {
+static Function *emitGlobalToListReduceFunction(
+    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
+    OpenMPIRBuilder &OMPBuilder) {
+  IRBuilder<> &Builder = OMPBuilder.Builder;
+  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+  LLVMContext &Ctx = M.getContext();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *Int32Ty = Builder.getInt32Ty();
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  auto FuncTy =
+      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  Function *LtGRFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_global_to_list_reduce_func", &M);
+  LtGRFunc->setDoesNotRecurse();
+
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+  Builder.SetInsertPoint(EntryBlock);
+
+  // Set arg names
+  Argument *Arg0 = LtGRFunc->getArg(0);
+  Argument *Arg1 = LtGRFunc->getArg(1);
+  Argument *Arg2 = LtGRFunc->getArg(2);
+  Arg0->setName("buffer_arg");
+  Arg1->setName("idx_arg");
+  Arg2->setName("reduce_list_arg");
+
+  Value *BufferArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
+  Value *IdxArgAlloca =
+      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
+  Value *ReduceListArgAlloca =
+      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  // FIXME(Jan): Compute array type
+  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
+  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+  Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
+  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
+  // FIXME(JAN): Assume a single globalized variable for now, this should be
+  // passed in
+  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
+  Type *TypeArgs[] = {SingleReductionTy};
+  StructType *ReductionsBufferTy =
+      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
+
+  Builder.CreateStore(Arg0, BufferArgAddrCast);
+  Builder.CreateStore(Arg1, IdxArgAddrCast);
+  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
+
+  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
+  Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
+  // FIXME(Jan): Assume TEK_SCALAR
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+    Value *BufferVD =
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index());
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+  }
+
+  Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {ReduceList, LocalReduceListAddrCast});
+  Builder.CreateRetVoid();
+  Builder.restoreIP(OldIP);
+  return LtGRFunc;
+}
+
+static Function *getFreshReductionFunc(Module &M) {
   Type *VoidTy = Type::getVoidTy(M.getContext());
   Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
   auto *FuncTy =
       FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
   return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                          M.getDataLayout().getDefaultGlobalsAddressSpace(),
                           ".omp.reduction.func", &M);
 }
 
-OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
-    const LocationDescription &Loc, InsertPointTy AllocaIP,
-    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
-  for (const ReductionInfo &RI : ReductionInfos) {
+static void populateReductionFunction(
+    Function *ReductionFunc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    IRBuilder<> &Builder, bool IsGPU) {
+  Module *Module = ReductionFunc->getParent();
+  BasicBlock *ReductionFuncBlock =
+      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+  Builder.SetInsertPoint(ReductionFuncBlock);
+  Value *LHSArrayPtr = nullptr;
+  Value *RHSArrayPtr = nullptr;
+  if (IsGPU) {
+    // Need to alloca memory here and deal with the pointers before getting
+    // LHS/RHS pointers out
+    //
+    Argument *Arg0 = ReductionFunc->getArg(0);
+    Argument *Arg1 = ReductionFunc->getArg(1);
+    Type *Arg0Type = Arg0->getType();
+    Type *Arg1Type = Arg1->getType();
+
+    Value *LHSAlloca =
+        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+    Value *RHSAlloca =
+        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+    Value *LHSAddrCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
+    Value *RHSAddrCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
+    Builder.CreateStore(Arg0, LHSAddrCast);
+    Builder.CreateStore(Arg1, RHSAddrCast);
+    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
+  } else {
+    LHSArrayPtr = ReductionFunc->getArg(0);
+    RHSArrayPtr = ReductionFunc->getArg(1);
+  }
+
+  unsigned NumReductions = ReductionInfos.size();
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, LHSArrayPtr, 0, En.index());
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType());
+    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, RHSArrayPtr, 0, En.index());
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType());
+    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+    Value *Reduced;
+    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+    if (!Builder.GetInsertBlock())
+      return;
+    Builder.CreateStore(Reduced, LHSPtr);
+  }
+  Builder.CreateRetVoid();
+}
+
+static void
+checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+                    bool IsGPU) {
+  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
     (void)RI;
     assert(RI.Variable && "expected non-null variable");
     assert(RI.PrivateVariable && "expected non-null private variable");
     assert(RI.ReductionGen && "expected non-null reduction generator callback");
-    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
-           "expected variables and their private equivalents to have the same "
-           "type");
+    // JAN: Skip this assertion for GPU, address spaces are present
+    if (!IsGPU) {
+      assert(
+          RI.Variable->getType() == RI.PrivateVariable->getType() &&
+          "expected variables and their private equivalents to have the same "
+          "type");
+    }
     assert(RI.Variable->getType()->isPointerTy() &&
            "expected variables to be pointers");
   }
+}
 
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
+    bool IsTeamsReduction, bool HasDistribute) {
+  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
+  LLVMContext &Ctx = M.getContext();
   if (!updateToLocation(Loc))
     return InsertPointTy();
 
+  if (ReductionInfos.size() == 0)
+    return Builder.saveIP();
+
+  assert(ReductionInfos.size() == 1 && "More than one reduction variable");
+
+  // Copied code from createReductions
+  BasicBlock *InsertBlock = Loc.IP.getBlock();
+  BasicBlock *ContinuationBlock =
+      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
+  InsertBlock->getTerminator()->eraseFromParent();
+  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+
+  Function *ReductionFunc = nullptr;
+  if (GLOBAL_ReductionFunc) {
+    ReductionFunc = GLOBAL_ReductionFunc;
+  } else {
+    ReductionFunc = getFreshReductionFunc(M);
+    GLOBAL_ReductionFunc = ReductionFunc;
+    InsertPointTy CurIP = Builder.saveIP();
+    populateReductionFunction(ReductionFunc, ReductionInfos, Builder, true);
+    Builder.restoreIP(CurIP);
+  }
+
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
+  Value *RTLoc =
+      getOrCreateIdent(SrcLocStr, SrcLocStrSize, llvm::omp::IdentFlag(0), 0);
+
+  // 1. Build a list of reduction variables
+  auto Size = ReductionInfos.size();
+  // FIXME(JAN): skipping variably modified type storage for array size
+  Type *PtrTy = PointerType::getUnqual(Ctx);
+  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
+  InsertPointTy CurIP = Builder.saveIP();
+  Builder.restoreIP(AllocaIP);
+  Value *ReductionListAlloca =
+      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
+  Value *ReductionList =
+      Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionListAlloca, PtrTy);
+  Builder.restoreIP(CurIP);
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *ElemPtr = Builder.CreateConstGEP2_64(RedArrayTy, ReductionList, 0,
+                                                En.index(), "elem_ptr");
+    Value *CastElem =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+    Builder.CreateStore(CastElem, ElemPtr);
+  }
+  CurIP = Builder.saveIP();
+  Function *SarFunc = emitShuffleAndReduceFunction(M, Loc, ReductionInfos,
+                                                   ReductionFunc, *this);
+  Function *WcFunc = emitInterWarpCopyFunction(M, Loc, ReductionInfos, *this);
+  Builder.restoreIP(CurIP);
+
+  Value *RL =
+    Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
+  Value *ReductionDataSize =
+      getTypeSizeInBytesValue(Builder, M, ReductionInfos.begin()->ElementType);
+
+  Value *Res;
+  if (!IsTeamsReduction) {
+    Value *SarFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
+    Value *WcFuncCast =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
+    Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
+    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
+    Res = Builder.CreateCall(Pv2Ptr, Args);
+  } else {
+    CurIP = Builder.saveIP();
+    Function *LtGCFunc =
+        emitListToGlobalCopyFunction(M, Loc, ReductionInfos, *this);
+    Function *LtGRFunc = emitListToGlobalReduceFunction(M, Loc, ReductionInfos,
+                                                        ReductionFunc, *this);
+    Function *GtLCFunc =
+        emitGlobalToListCopyFunction(M, Loc, ReductionInfos, *this);
+    Function *GtLRFunc = emitGlobalToListReduceFunction(M, Loc, ReductionInfos,
+                                                        ReductionFunc, *this);
+    Builder.restoreIP(CurIP);
+
+    Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
+
+    Value *KernelTeamsReductionPtr = Builder.CreateCall(RedFixedBuferFn, {});
+
+    Value *Args3[] = {RTLoc,
+                      KernelTeamsReductionPtr,
+                      Builder.getInt32(1024),
+                      ReductionDataSize,
+                      RL,
+                      SarFunc,
+                      WcFunc,
+                      LtGCFunc,
+                      LtGRFunc,
+                      GtLCFunc,
+                      GtLRFunc};
+
+    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
+        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
+    Res = Builder.CreateCall(TeamsReduceFn, Args3);
+  }
+
+  if (IsTeamsReduction || !HasDistribute) {
+    Function *CurFunc = Builder.GetInsertBlock()->getParent();
+    BasicBlock *ExitBB =
+        BasicBlock::Create(Ctx, ".omp.reduction.done", CurFunc);
+    BasicBlock *ThenBB =
+        BasicBlock::Create(Ctx, ".omp.reduction.then", CurFunc);
+    Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
+    Builder.CreateCondBr(Cond, ThenBB, ExitBB);
+
+    Builder.SetInsertPoint(ThenBB);
+    for (auto En : enumerate(ReductionInfos)) {
+      const ReductionInfo &RI = En.value();
+      Value *InputVal = Builder.CreateLoad(RI.ElementType, RI.Variable);
+      Value *RedVal = Builder.CreateLoad(
+          RI.ElementType, Builder.CreatePointerBitCastOrAddrSpaceCast(
+                              RI.PrivateVariable, PtrTy));
+      Value *sum;
+      Builder.restoreIP(
+          RI.ReductionGen(Builder.saveIP(), InputVal, RedVal, sum));
+      Builder.CreateStore(sum, RI.Variable);
+      Builder.CreateBr(ExitBB);
+    }
+    Builder.SetInsertPoint(ExitBB);
+  }
+  Builder.CreateBr(ContinuationBlock);
+  Builder.SetInsertPoint(ContinuationBlock);
+  return Builder.saveIP();
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
+    const LocationDescription &Loc, InsertPointTy AllocaIP,
+    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
+    bool IsTeamsReduction, bool HasDistribute) {
+  if (Config.isGPU())
+    return createReductionsGPU(Loc, AllocaIP, ReductionInfos, IsNoWait,
+                               IsTeamsReduction, HasDistribute);
+
+  checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
+
+  if (!updateToLocation(Loc))
+    return InsertPointTy();
+
+  if (ReductionInfos.size() == 0)
+    return Builder.saveIP();
+
   BasicBlock *InsertBlock = Loc.IP.getBlock();
   BasicBlock *ContinuationBlock =
       InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
@@ -2189,33 +3355,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   // Populate the outlined reduction function using the elementwise reduction
   // function. Partial values are extracted from the type-erased array of
   // pointers to private variables.
-  BasicBlock *ReductionFuncBlock =
-      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
-  Builder.SetInsertPoint(ReductionFuncBlock);
-  Value *LHSArrayPtr = ReductionFunc->getArg(0);
-  Value *RHSArrayPtr = ReductionFunc->getArg(1);
-
-  for (auto En : enumerate(ReductionInfos)) {
-    const ReductionInfo &RI = En.value();
-    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedArrayTy, LHSArrayPtr, 0, En.index());
-    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
-    Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
-    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
-    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedArrayTy, RHSArrayPtr, 0, En.index());
-    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
-    Value *RHSPtr =
-        Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
-    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
-    Value *Reduced;
-    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
-    if (!Builder.GetInsertBlock())
-      return InsertPointTy();
-    Builder.CreateStore(Reduced, LHSPtr);
-  }
-  Builder.CreateRetVoid();
-
+  populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
   Builder.SetInsertPoint(ContinuationBlock);
   return Builder.saveIP();
 }
@@ -2728,10 +3868,24 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
 static void createTargetLoopWorkshareCall(
     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
-    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
-  Type *TripCountTy = TripCount->getType();
+    Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) {
   Module &M = OMPBuilder->M;
   IRBuilder<> &Builder = OMPBuilder->Builder;
+  Value *TripCount = TripCountOrig;
+  // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
+  // not be the right way to fix it, but this works for now.
+  if (OMPBuilder->Config.isGPU()) {
+    if (LoopType != WorksharingLoopType::DistributeStaticLoop)
+      Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
+    LLVMContext &Ctx = M.getContext();
+    Type *IVTy = TripCountOrig->getType();
+    Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32
+                             ? Type::getInt32Ty(Ctx)
+                             : Type::getInt64Ty(Ctx);
+    Constant *One = ConstantInt::get(InternalIVTy, 1);
+    TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count");
+  }
+  Type *TripCountTy = TripCount->getType();
   FunctionCallee RTLFn =
       getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
   SmallVector<Value *, 8> RealArgs;
@@ -4469,6 +5623,9 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
 
   Function *Kernel = Builder.GetInsertBlock()->getParent();
 
+  // Set the grid value in the config needed for lowering later on
+  Config.setGridValue(getGridValue(T, Config.TargetFeatures));
+
   // Manifest the launch configuration in the metadata matching the kernel
   // environment.
   if (MinTeamsVal > 1 || MaxTeamsVal > 0)
@@ -4721,6 +5878,10 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
     OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
     if (T.isAMDGCN())
       OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
+    if (!Config.TargetCPU.empty())
+      OutlinedFn->addFnAttr("target-cpu", Config.TargetCPU);
+    if (!Config.TargetFeatures.empty())
+      OutlinedFn->addFnAttr("target-features", Config.TargetFeatures);
   }
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index c87e895bb5404c..ed7c3b29f88af5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -2905,6 +2905,17 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
             }
             return failure();
           })
+      .Case("omp.target",
+            [&](Attribute attr) {
+              if (auto targetAttr = attr.dyn_cast<omp::TargetAttr>()) {
+                llvm::OpenMPIRBuilderConfig &config =
+                    moduleTranslation.getOpenMPBuilder()->Config;
+                config.TargetCPU = targetAttr.getTargetCpu();
+                config.TargetFeatures = targetAttr.getTargetFeatures();
+                return success();
+              }
+              return failure();
+            })
       .Default([](Attribute) {
         // Fall through for omp attributes that do not require lowering.
         return success();

>From 3807846d715dd1bbe097c928978766aaa60f7fed Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 31 Jan 2024 20:05:04 +0000
Subject: [PATCH 02/10] Clang migrate to using OMPBuilder.

---
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp      | 1374 +-----------
 clang/lib/CodeGen/CGOpenMPRuntimeGPU.h        |    3 -
 ...arallel_reduction_codegen_tbaa_PR46146.cpp |  546 ++---
 .../OpenMP/nvptx_teams_reduction_codegen.cpp  |   24 +-
 .../target_teams_generic_loop_codegen.cpp     |   12 +-
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  623 +++++-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 1851 +++++++++--------
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |   11 +-
 8 files changed, 1959 insertions(+), 2485 deletions(-)

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db0..7cddf73306f2d9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final
 };
 } // anonymous namespace
 
-/// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  unsigned LaneIDBits =
-      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
-}
-
-/// Get the id of the current lane in the Warp.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
-  CGBuilderTy &Bld = CGF.Builder;
-  unsigned LaneIDBits =
-      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
-  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
-  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
-                       "nvptx_lane_id");
-}
-
 CGOpenMPRuntimeGPU::ExecutionMode
 CGOpenMPRuntimeGPU::getExecutionMode() const {
   return CurrentExecutionMode;
@@ -1429,1132 +1404,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                               TBAAAccessInfo());
 }
 
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
-                                                 llvm::Value *Elem,
-                                                 QualType ElemType,
-                                                 llvm::Value *Offset,
-                                                 SourceLocation Loc) {
-  CodeGenModule &CGM = CGF.CGM;
-  CGBuilderTy &Bld = CGF.Builder;
-  CGOpenMPRuntimeGPU &RT =
-      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
-  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
-
-  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
-  assert(Size.getQuantity() <= 8 &&
-         "Unsupported bitwidth in shuffle instruction.");
-
-  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
-                                  ? OMPRTL___kmpc_shuffle_int32
-                                  : OMPRTL___kmpc_shuffle_int64;
-
-  // Cast all types to 32- or 64-bit values before calling shuffle routines.
-  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
-      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
-  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
-  llvm::Value *WarpSize =
-      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
-  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
-      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
-      {ElemCast, Offset, WarpSize});
-
-  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
-                            Address DestAddr, QualType ElemType,
-                            llvm::Value *Offset, SourceLocation Loc) {
-  CGBuilderTy &Bld = CGF.Builder;
-
-  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
-  // Create the loop over the big sized data.
-  // ptr = (void*)Elem;
-  // ptrEnd = (void*) Elem + 1;
-  // Step = 8;
-  // while (ptr + Step < ptrEnd)
-  //   shuffle((int64_t)*ptr);
-  // Step = 4;
-  // while (ptr + Step < ptrEnd)
-  //   shuffle((int32_t)*ptr);
-  // ...
-  Address ElemPtr = DestAddr;
-  Address Ptr = SrcAddr;
-  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
-  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
-    if (Size < CharUnits::fromQuantity(IntSize))
-      continue;
-    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
-        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
-        /*Signed=*/1);
-    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
-    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
-                                                  IntTy);
-    ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-        ElemPtr, IntTy->getPointerTo(), IntTy);
-    if (Size.getQuantity() / IntSize > 1) {
-      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
-      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
-      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
-      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
-      CGF.EmitBlock(PreCondBB);
-      llvm::PHINode *PhiSrc =
-          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
-      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
-      llvm::PHINode *PhiDest =
-          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
-      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
-      Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
-      ElemPtr =
-          Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
-      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
-          CGF.Int8Ty, PtrEnd.getPointer(),
-          Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
-                                                  CGF.VoidPtrTy));
-      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
-                       ThenBB, ExitBB);
-      CGF.EmitBlock(ThenBB);
-      llvm::Value *Res = createRuntimeShuffleFunction(
-          CGF,
-          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
-                               LValueBaseInfo(AlignmentSource::Type),
-                               TBAAAccessInfo()),
-          IntType, Offset, Loc);
-      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
-                            LValueBaseInfo(AlignmentSource::Type),
-                            TBAAAccessInfo());
-      Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
-      Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
-      PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
-      PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
-      CGF.EmitBranch(PreCondBB);
-      CGF.EmitBlock(ExitBB);
-    } else {
-      llvm::Value *Res = createRuntimeShuffleFunction(
-          CGF,
-          CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
-                               LValueBaseInfo(AlignmentSource::Type),
-                               TBAAAccessInfo()),
-          IntType, Offset, Loc);
-      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
-                            LValueBaseInfo(AlignmentSource::Type),
-                            TBAAAccessInfo());
-      Ptr = Bld.CreateConstGEP(Ptr, 1);
-      ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
-    }
-    Size = Size % IntSize;
-  }
-}
-
-namespace {
-enum CopyAction : unsigned {
-  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
-  // the warp using shuffle instructions.
-  RemoteLaneToThread,
-  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
-  ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
-  llvm::Value *RemoteLaneOffset;
-  llvm::Value *ScratchpadIndex;
-  llvm::Value *ScratchpadWidth;
-};
-
-/// Emit instructions to copy a Reduce list, which contains partially
-/// aggregated values, in the specified direction.
-static void emitReductionListCopy(
-    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
-    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
-    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-
-  CodeGenModule &CGM = CGF.CGM;
-  ASTContext &C = CGM.getContext();
-  CGBuilderTy &Bld = CGF.Builder;
-
-  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
-
-  // Iterates, element-by-element, through the source Reduce list and
-  // make a copy.
-  unsigned Idx = 0;
-  for (const Expr *Private : Privates) {
-    Address SrcElementAddr = Address::invalid();
-    Address DestElementAddr = Address::invalid();
-    Address DestElementPtrAddr = Address::invalid();
-    // Should we shuffle in an element from a remote lane?
-    bool ShuffleInElement = false;
-    // Set to true to update the pointer in the dest Reduce list to a
-    // newly created element.
-    bool UpdateDestListPtr = false;
-    QualType PrivatePtrType = C.getPointerType(Private->getType());
-    llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
-
-    switch (Action) {
-    case RemoteLaneToThread: {
-      // Step 1.1: Get the address for the src element in the Reduce list.
-      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
-      SrcElementAddr = CGF.EmitLoadOfPointer(
-          SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
-          PrivatePtrType->castAs<PointerType>());
-
-      // Step 1.2: Create a temporary to store the element in the destination
-      // Reduce list.
-      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
-      DestElementAddr =
-          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
-      ShuffleInElement = true;
-      UpdateDestListPtr = true;
-      break;
-    }
-    case ThreadCopy: {
-      // Step 1.1: Get the address for the src element in the Reduce list.
-      Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
-      SrcElementAddr = CGF.EmitLoadOfPointer(
-          SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
-          PrivatePtrType->castAs<PointerType>());
-
-      // Step 1.2: Get the address for dest element.  The destination
-      // element has already been created on the thread's stack.
-      DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
-      DestElementAddr = CGF.EmitLoadOfPointer(
-          DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
-          PrivatePtrType->castAs<PointerType>());
-      break;
-    }
-    }
-
-    // Regardless of src and dest of copy, we emit the load of src
-    // element as this is required in all directions
-    SrcElementAddr = SrcElementAddr.withElementType(
-        CGF.ConvertTypeForMem(Private->getType()));
-    DestElementAddr =
-        DestElementAddr.withElementType(SrcElementAddr.getElementType());
-
-    // Now that all active lanes have read the element in the
-    // Reduce list, shuffle over the value from the remote lane.
-    if (ShuffleInElement) {
-      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
-                      RemoteLaneOffset, Private->getExprLoc());
-    } else {
-      switch (CGF.getEvaluationKind(Private->getType())) {
-      case TEK_Scalar: {
-        llvm::Value *Elem = CGF.EmitLoadOfScalar(
-            SrcElementAddr, /*Volatile=*/false, Private->getType(),
-            Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
-            TBAAAccessInfo());
-        // Store the source element value to the dest element address.
-        CGF.EmitStoreOfScalar(
-            Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
-            LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
-        break;
-      }
-      case TEK_Complex: {
-        CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
-            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
-            Private->getExprLoc());
-        CGF.EmitStoreOfComplex(
-            Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
-            /*isInit=*/false);
-        break;
-      }
-      case TEK_Aggregate:
-        CGF.EmitAggregateCopy(
-            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
-            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
-            Private->getType(), AggValueSlot::DoesNotOverlap);
-        break;
-      }
-    }
-
-    // Step 3.1: Modify reference in dest Reduce list as needed.
-    // Modifying the reference in Reduce list to point to the newly
-    // created element.  The element is live in the current function
-    // scope and that of functions it invokes (i.e., reduce_function).
-    // RemoteReduceData[i] = (void*)&RemoteElem
-    if (UpdateDestListPtr) {
-      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
-                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
-                            DestElementPtrAddr, /*Volatile=*/false,
-                            C.VoidPtrTy);
-    }
-
-    ++Idx;
-  }
-}
-
-/// This function emits a helper that gathers Reduce lists from the first
-/// lane of every active warp to lanes in the first warp.
-///
-/// void inter_warp_copy_func(void* reduce_data, num_warps)
-///   shared smem[warp_size];
-///   For all data entries D in reduce_data:
-///     sync
-///     If (I am the first lane in each warp)
-///       Copy my local D to smem[warp_id]
-///     sync
-///     if (I am the first warp)
-///       Copy smem[thread_id] to my local D
-static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
-                                              ArrayRef<const Expr *> Privates,
-                                              QualType ReductionArrayTy,
-                                              SourceLocation Loc) {
-  ASTContext &C = CGM.getContext();
-  llvm::Module &M = CGM.getModule();
-
-  // ReduceList: thread local Reduce list.
-  // At the stage of the computation when this function is called, partially
-  // aggregated values reside in the first lane of every active warp.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                  C.VoidPtrTy, ImplicitParamKind::Other);
-  // NumWarps: number of warps active in the parallel region.  This could
-  // be smaller than 32 (max warps in a CTA) for partial block reduction.
-  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                C.getIntTypeForBitwidth(32, /* Signed */ true),
-                                ImplicitParamKind::Other);
-  FunctionArgList Args;
-  Args.push_back(&ReduceListArg);
-  Args.push_back(&NumWarpsArg);
-
-  const CGFunctionInfo &CGFI =
-      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
-                                    llvm::GlobalValue::InternalLinkage,
-                                    "_omp_reduction_inter_warp_copy_func", &M);
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
-  Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  // This array is used as a medium to transfer, one reduce element at a time,
-  // the data from the first lane of every warp to lanes in the first warp
-  // in order to perform the final step of a reduction in a parallel region
-  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
-  // for reduced latency, as well as to have a distinct copy for concurrently
-  // executing target regions.  The array is declared with common linkage so
-  // as to be shared across compilation units.
-  StringRef TransferMediumName =
-      "__openmp_nvptx_data_transfer_temporary_storage";
-  llvm::GlobalVariable *TransferMedium =
-      M.getGlobalVariable(TransferMediumName);
-  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
-  if (!TransferMedium) {
-    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
-        llvm::UndefValue::get(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
-    CGM.addCompilerUsedGlobal(TransferMedium);
-  }
-
-  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
-  // Get the CUDA thread id of the current OpenMP thread on the GPU.
-  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
-  // nvptx_lane_id = nvptx_id % warpsize
-  llvm::Value *LaneID = getNVPTXLaneID(CGF);
-  // nvptx_warp_id = nvptx_id / warpsize
-  llvm::Value *WarpID = getNVPTXWarpID(CGF);
-
-  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
-  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
-  Address LocalReduceList(
-      Bld.CreatePointerBitCastOrAddrSpaceCast(
-          CGF.EmitLoadOfScalar(
-              AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
-              LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
-          ElemTy->getPointerTo()),
-      ElemTy, CGF.getPointerAlign());
-
-  unsigned Idx = 0;
-  for (const Expr *Private : Privates) {
-    //
-    // Warp master copies reduce element to transfer medium in __shared__
-    // memory.
-    //
-    unsigned RealTySize =
-        C.getTypeSizeInChars(Private->getType())
-            .alignTo(C.getTypeAlignInChars(Private->getType()))
-            .getQuantity();
-    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
-      unsigned NumIters = RealTySize / TySize;
-      if (NumIters == 0)
-        continue;
-      QualType CType = C.getIntTypeForBitwidth(
-          C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
-      llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
-      CharUnits Align = CharUnits::fromQuantity(TySize);
-      llvm::Value *Cnt = nullptr;
-      Address CntAddr = Address::invalid();
-      llvm::BasicBlock *PrecondBB = nullptr;
-      llvm::BasicBlock *ExitBB = nullptr;
-      if (NumIters > 1) {
-        CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
-        CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
-                              /*Volatile=*/false, C.IntTy);
-        PrecondBB = CGF.createBasicBlock("precond");
-        ExitBB = CGF.createBasicBlock("exit");
-        llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
-        // There is no need to emit line number for unconditional branch.
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(PrecondBB);
-        Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
-        llvm::Value *Cmp =
-            Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
-        Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
-        CGF.EmitBlock(BodyBB);
-      }
-      // kmpc_barrier.
-      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
-                                             /*EmitChecks=*/false,
-                                             /*ForceSimpleCall=*/true);
-      llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
-      llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
-      llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
-      // if (lane_id == 0)
-      llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
-      Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
-      CGF.EmitBlock(ThenBB);
-
-      // Reduce element = LocalReduceList[i]
-      Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
-      llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
-          ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-      // elemptr = ((CopyType*)(elemptrptr)) + I
-      Address ElemPtr(ElemPtrPtr, CopyType, Align);
-      if (NumIters > 1)
-        ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
-
-      // Get pointer to location in transfer medium.
-      // MediumPtr = &medium[warp_id]
-      llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
-          TransferMedium->getValueType(), TransferMedium,
-          {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
-      // Casting to actual data type.
-      // MediumPtr = (CopyType*)MediumPtrAddr;
-      Address MediumPtr(MediumPtrVal, CopyType, Align);
-
-      // elem = *elemptr
-      //*MediumPtr = elem
-      llvm::Value *Elem = CGF.EmitLoadOfScalar(
-          ElemPtr, /*Volatile=*/false, CType, Loc,
-          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
-      // Store the source element value to the dest element address.
-      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
-                            LValueBaseInfo(AlignmentSource::Type),
-                            TBAAAccessInfo());
-
-      Bld.CreateBr(MergeBB);
-
-      CGF.EmitBlock(ElseBB);
-      Bld.CreateBr(MergeBB);
-
-      CGF.EmitBlock(MergeBB);
-
-      // kmpc_barrier.
-      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
-                                             /*EmitChecks=*/false,
-                                             /*ForceSimpleCall=*/true);
-
-      //
-      // Warp 0 copies reduce element from transfer medium.
-      //
-      llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
-      llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
-      llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
-
-      Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
-      llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
-          AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
-
-      // Up to 32 threads in warp 0 are active.
-      llvm::Value *IsActiveThread =
-          Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
-      Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
-
-      CGF.EmitBlock(W0ThenBB);
-
-      // SrcMediumPtr = &medium[tid]
-      llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
-          TransferMedium->getValueType(), TransferMedium,
-          {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
-      // SrcMediumVal = *SrcMediumPtr;
-      Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
-
-      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
-      Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
-      llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
-          TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
-      Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
-      if (NumIters > 1)
-        TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
-
-      // *TargetElemPtr = SrcMediumVal;
-      llvm::Value *SrcMediumValue =
-          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
-      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
-                            CType);
-      Bld.CreateBr(W0MergeBB);
-
-      CGF.EmitBlock(W0ElseBB);
-      Bld.CreateBr(W0MergeBB);
-
-      CGF.EmitBlock(W0MergeBB);
-
-      if (NumIters > 1) {
-        Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
-        CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
-        CGF.EmitBranch(PrecondBB);
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(ExitBB);
-      }
-      RealTySize %= TySize;
-    }
-    ++Idx;
-  }
-
-  CGF.FinishFunction();
-  return Fn;
-}
-
-/// Emit a helper that reduces data across two OpenMP threads (lanes)
-/// in the same warp.  It uses shuffle instructions to copy over data from
-/// a remote lane's stack.  The reduction algorithm performed is specified
-/// by the fourth parameter.
-///
-/// Algorithm Versions.
-/// Full Warp Reduce (argument value 0):
-///   This algorithm assumes that all 32 lanes are active and gathers
-///   data from these 32 lanes, producing a single resultant value.
-/// Contiguous Partial Warp Reduce (argument value 1):
-///   This algorithm assumes that only a *contiguous* subset of lanes
-///   are active.  This happens for the last warp in a parallel region
-///   when the user specified num_threads is not an integer multiple of
-///   32.  This contiguous subset always starts with the zeroth lane.
-/// Partial Warp Reduce (argument value 2):
-///   This algorithm gathers data from any number of lanes at any position.
-/// All reduced values are stored in the lowest possible lane.  The set
-/// of problems every algorithm addresses is a super set of those
-/// addressable by algorithms with a lower version number.  Overhead
-/// increases as algorithm version increases.
-///
-/// Terminology
-/// Reduce element:
-///   Reduce element refers to the individual data field with primitive
-///   data types to be combined and reduced across threads.
-/// Reduce list:
-///   Reduce list refers to a collection of local, thread-private
-///   reduce elements.
-/// Remote Reduce list:
-///   Remote Reduce list refers to a collection of remote (relative to
-///   the current thread) reduce elements.
-///
-/// We distinguish between three states of threads that are important to
-/// the implementation of this function.
-/// Alive threads:
-///   Threads in a warp executing the SIMT instruction, as distinguished from
-///   threads that are inactive due to divergent control flow.
-/// Active threads:
-///   The minimal set of threads that has to be alive upon entry to this
-///   function.  The computation is correct iff active threads are alive.
-///   Some threads are alive but they are not active because they do not
-///   contribute to the computation in any useful manner.  Turning them off
-///   may introduce control flow overheads without any tangible benefits.
-/// Effective threads:
-///   In order to comply with the argument requirements of the shuffle
-///   function, we must keep all lanes holding data alive.  But at most
-///   half of them perform value aggregation; we refer to this half of
-///   threads as effective. The other half is simply handing off their
-///   data.
-///
-/// Procedure
-/// Value shuffle:
-///   In this step active threads transfer data from higher lane positions
-///   in the warp to lower lane positions, creating Remote Reduce list.
-/// Value aggregation:
-///   In this step, effective threads combine their thread local Reduce list
-///   with Remote Reduce list and store the result in the thread local
-///   Reduce list.
-/// Value copy:
-///   In this step, we deal with the assumption made by algorithm 2
-///   (i.e. contiguity assumption).  When we have an odd number of lanes
-///   active, say 2k+1, only k threads will be effective and therefore k
-///   new values will be produced.  However, the Reduce list owned by the
-///   (2k+1)th thread is ignored in the value aggregation.  Therefore
-///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
-///   that the contiguity assumption still holds.
-static llvm::Function *emitShuffleAndReduceFunction(
-    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
-    QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
-  ASTContext &C = CGM.getContext();
-
-  // Thread local Reduce list used to host the values of data to be reduced.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                  C.VoidPtrTy, ImplicitParamKind::Other);
-  // Current lane id; could be logical.
-  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
-                              ImplicitParamKind::Other);
-  // Offset of the remote source lane relative to the current lane.
-  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                        C.ShortTy, ImplicitParamKind::Other);
-  // Algorithm version.  This is expected to be known at compile time.
-  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                               C.ShortTy, ImplicitParamKind::Other);
-  FunctionArgList Args;
-  Args.push_back(&ReduceListArg);
-  Args.push_back(&LaneIDArg);
-  Args.push_back(&RemoteLaneOffsetArg);
-  Args.push_back(&AlgoVerArg);
-
-  const CGFunctionInfo &CGFI =
-      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
-  Fn->setDoesNotRecurse();
-
-  CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
-  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
-  Address LocalReduceList(
-      Bld.CreatePointerBitCastOrAddrSpaceCast(
-          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, SourceLocation()),
-          ElemTy->getPointerTo()),
-      ElemTy, CGF.getPointerAlign());
-
-  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
-  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
-      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
-  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
-  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
-      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
-  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
-  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
-      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
-
-  // Create a local thread-private variable to host the Reduce list
-  // from a remote lane.
-  Address RemoteReduceList =
-      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
-
-  // This loop iterates through the list of reduce elements and copies,
-  // element by element, from a remote lane in the warp to RemoteReduceList,
-  // hosted on the thread's stack.
-  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
-                        LocalReduceList, RemoteReduceList,
-                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
-                         /*ScratchpadIndex=*/nullptr,
-                         /*ScratchpadWidth=*/nullptr});
-
-  // The actions to be performed on the Remote Reduce list is dependent
-  // on the algorithm version.
-  //
-  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
-  //  LaneId % 2 == 0 && Offset > 0):
-  //    do the reduction value aggregation
-  //
-  //  The thread local variable Reduce list is mutated in place to host the
-  //  reduced data, which is the aggregated value produced from local and
-  //  remote lanes.
-  //
-  //  Note that AlgoVer is expected to be a constant integer known at compile
-  //  time.
-  //  When AlgoVer==0, the first conjunction evaluates to true, making
-  //    the entire predicate true during compile time.
-  //  When AlgoVer==1, the second conjunction has only the second part to be
-  //    evaluated during runtime.  Other conjunctions evaluates to false
-  //    during compile time.
-  //  When AlgoVer==2, the third conjunction has only the second part to be
-  //    evaluated during runtime.  Other conjunctions evaluates to false
-  //    during compile time.
-  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
-
-  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
-  llvm::Value *CondAlgo1 = Bld.CreateAnd(
-      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
-
-  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
-  llvm::Value *CondAlgo2 = Bld.CreateAnd(
-      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
-  CondAlgo2 = Bld.CreateAnd(
-      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
-
-  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
-  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
-
-  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
-  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
-  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
-
-  CGF.EmitBlock(ThenBB);
-  // reduce_function(LocalReduceList, RemoteReduceList)
-  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      LocalReduceList.getPointer(), CGF.VoidPtrTy);
-  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
-  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
-      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
-  Bld.CreateBr(MergeBB);
-
-  CGF.EmitBlock(ElseBB);
-  Bld.CreateBr(MergeBB);
-
-  CGF.EmitBlock(MergeBB);
-
-  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
-  // Reduce list.
-  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
-  llvm::Value *CondCopy = Bld.CreateAnd(
-      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
-
-  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
-  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
-  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
-  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
-
-  CGF.EmitBlock(CpyThenBB);
-  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
-                        RemoteReduceList, LocalReduceList);
-  Bld.CreateBr(CpyMergeBB);
-
-  CGF.EmitBlock(CpyElseBB);
-  Bld.CreateBr(CpyMergeBB);
-
-  CGF.EmitBlock(CpyMergeBB);
-
-  CGF.FinishFunction();
-  return Fn;
-}
-
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-///   For all data entries D in reduce_data:
-///     Copy local D to buffer.D[Idx]
-static llvm::Value *emitListToGlobalCopyFunction(
-    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
-    QualType ReductionArrayTy, SourceLocation Loc,
-    const RecordDecl *TeamReductionRec,
-    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-        &VarFieldMap) {
-  ASTContext &C = CGM.getContext();
-
-  // Buffer: global reduction buffer.
-  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                              C.VoidPtrTy, ImplicitParamKind::Other);
-  // Idx: index of the buffer.
-  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
-                           ImplicitParamKind::Other);
-  // ReduceList: thread local Reduce list.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                  C.VoidPtrTy, ImplicitParamKind::Other);
-  FunctionArgList Args;
-  Args.push_back(&BufferArg);
-  Args.push_back(&IdxArg);
-  Args.push_back(&ReduceListArg);
-
-  const CGFunctionInfo &CGFI =
-      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
-  Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
-  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
-  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
-  Address LocalReduceList(
-      Bld.CreatePointerBitCastOrAddrSpaceCast(
-          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, Loc),
-          ElemTy->getPointerTo()),
-      ElemTy, CGF.getPointerAlign());
-  QualType StaticTy = C.getRecordType(TeamReductionRec);
-  llvm::Type *LLVMReductionsBufferTy =
-      CGM.getTypes().ConvertTypeForMem(StaticTy);
-  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
-      LLVMReductionsBufferTy->getPointerTo());
-  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
-                                              /*Volatile=*/false, C.IntTy,
-                                              Loc)};
-  unsigned Idx = 0;
-  for (const Expr *Private : Privates) {
-    // Reduce element = LocalReduceList[i]
-    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
-    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
-        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-    // elemptr = ((CopyType*)(elemptrptr)) + I
-    ElemTy = CGF.ConvertTypeForMem(Private->getType());
-    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-        ElemPtrPtr, ElemTy->getPointerTo());
-    Address ElemPtr =
-        Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
-    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
-    // Global = Buffer.VD[Idx];
-    const FieldDecl *FD = VarFieldMap.lookup(VD);
-    llvm::Value *BufferPtr =
-        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
-    LValue GlobLVal = CGF.EmitLValueForField(
-        CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
-    Address GlobAddr = GlobLVal.getAddress(CGF);
-    GlobLVal.setAddress(Address(GlobAddr.getPointer(),
-                                CGF.ConvertTypeForMem(Private->getType()),
-                                GlobAddr.getAlignment()));
-    switch (CGF.getEvaluationKind(Private->getType())) {
-    case TEK_Scalar: {
-      llvm::Value *V = CGF.EmitLoadOfScalar(
-          ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
-          LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
-      CGF.EmitStoreOfScalar(V, GlobLVal);
-      break;
-    }
-    case TEK_Complex: {
-      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
-          CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
-      CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
-      break;
-    }
-    case TEK_Aggregate:
-      CGF.EmitAggregateCopy(GlobLVal,
-                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
-                            Private->getType(), AggValueSlot::DoesNotOverlap);
-      break;
-    }
-    ++Idx;
-  }
-
-  CGF.FinishFunction();
-  return Fn;
-}
-
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-///  void *GlobPtrs[];
-///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
-///  ...
-///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
-///  reduce_function(GlobPtrs, reduce_data);
-static llvm::Value *emitListToGlobalReduceFunction(
-    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
-    QualType ReductionArrayTy, SourceLocation Loc,
-    const RecordDecl *TeamReductionRec,
-    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-        &VarFieldMap,
-    llvm::Function *ReduceFn) {
-  ASTContext &C = CGM.getContext();
-
-  // Buffer: global reduction buffer.
-  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                              C.VoidPtrTy, ImplicitParamKind::Other);
-  // Idx: index of the buffer.
-  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
-                           ImplicitParamKind::Other);
-  // ReduceList: thread local Reduce list.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                  C.VoidPtrTy, ImplicitParamKind::Other);
-  FunctionArgList Args;
-  Args.push_back(&BufferArg);
-  Args.push_back(&IdxArg);
-  Args.push_back(&ReduceListArg);
-
-  const CGFunctionInfo &CGFI =
-      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
-  Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
-  QualType StaticTy = C.getRecordType(TeamReductionRec);
-  llvm::Type *LLVMReductionsBufferTy =
-      CGM.getTypes().ConvertTypeForMem(StaticTy);
-  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
-      LLVMReductionsBufferTy->getPointerTo());
-
-  // 1. Build a list of reduction variables.
-  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
-  Address ReductionList =
-      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
-  auto IPriv = Privates.begin();
-  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
-                                              /*Volatile=*/false, C.IntTy,
-                                              Loc)};
-  unsigned Idx = 0;
-  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
-    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-    // Global = Buffer.VD[Idx];
-    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
-    const FieldDecl *FD = VarFieldMap.lookup(VD);
-    llvm::Value *BufferPtr =
-        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
-    LValue GlobLVal = CGF.EmitLValueForField(
-        CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
-    Address GlobAddr = GlobLVal.getAddress(CGF);
-    CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
-                          C.VoidPtrTy);
-    if ((*IPriv)->getType()->isVariablyModifiedType()) {
-      // Store array size.
-      ++Idx;
-      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-      llvm::Value *Size = CGF.Builder.CreateIntCast(
-          CGF.getVLASize(
-                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
-              .NumElts,
-          CGF.SizeTy, /*isSigned=*/false);
-      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
-                              Elem);
-    }
-  }
-
-  // Call reduce_function(GlobalReduceList, ReduceList)
-  llvm::Value *GlobalReduceList = ReductionList.getPointer();
-  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
-  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
-      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
-  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
-      CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
-  CGF.FinishFunction();
-  return Fn;
-}
-
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-///   For all data entries D in reduce_data:
-///     Copy buffer.D[Idx] to local D;
-static llvm::Value *emitGlobalToListCopyFunction(
-    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
-    QualType ReductionArrayTy, SourceLocation Loc,
-    const RecordDecl *TeamReductionRec,
-    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-        &VarFieldMap) {
-  ASTContext &C = CGM.getContext();
-
-  // Buffer: global reduction buffer.
-  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                              C.VoidPtrTy, ImplicitParamKind::Other);
-  // Idx: index of the buffer.
-  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
-                           ImplicitParamKind::Other);
-  // ReduceList: thread local Reduce list.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                  C.VoidPtrTy, ImplicitParamKind::Other);
-  FunctionArgList Args;
-  Args.push_back(&BufferArg);
-  Args.push_back(&IdxArg);
-  Args.push_back(&ReduceListArg);
-
-  const CGFunctionInfo &CGFI =
-      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
-  Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
-  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
-  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
-  Address LocalReduceList(
-      Bld.CreatePointerBitCastOrAddrSpaceCast(
-          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
-                               C.VoidPtrTy, Loc),
-          ElemTy->getPointerTo()),
-      ElemTy, CGF.getPointerAlign());
-  QualType StaticTy = C.getRecordType(TeamReductionRec);
-  llvm::Type *LLVMReductionsBufferTy =
-      CGM.getTypes().ConvertTypeForMem(StaticTy);
-  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
-      LLVMReductionsBufferTy->getPointerTo());
-
-  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
-                                              /*Volatile=*/false, C.IntTy,
-                                              Loc)};
-  unsigned Idx = 0;
-  for (const Expr *Private : Privates) {
-    // Reduce element = LocalReduceList[i]
-    Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
-    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
-        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
-    // elemptr = ((CopyType*)(elemptrptr)) + I
-    ElemTy = CGF.ConvertTypeForMem(Private->getType());
-    ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-        ElemPtrPtr, ElemTy->getPointerTo());
-    Address ElemPtr =
-        Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
-    const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
-    // Global = Buffer.VD[Idx];
-    const FieldDecl *FD = VarFieldMap.lookup(VD);
-    llvm::Value *BufferPtr =
-        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
-    LValue GlobLVal = CGF.EmitLValueForField(
-        CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
-    Address GlobAddr = GlobLVal.getAddress(CGF);
-    GlobLVal.setAddress(Address(GlobAddr.getPointer(),
-                                CGF.ConvertTypeForMem(Private->getType()),
-                                GlobAddr.getAlignment()));
-    switch (CGF.getEvaluationKind(Private->getType())) {
-    case TEK_Scalar: {
-      llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
-      CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
-                            LValueBaseInfo(AlignmentSource::Type),
-                            TBAAAccessInfo());
-      break;
-    }
-    case TEK_Complex: {
-      CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
-      CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
-                             /*isInit=*/false);
-      break;
-    }
-    case TEK_Aggregate:
-      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
-                            GlobLVal, Private->getType(),
-                            AggValueSlot::DoesNotOverlap);
-      break;
-    }
-    ++Idx;
-  }
-
-  CGF.FinishFunction();
-  return Fn;
-}
-
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
-///  void *GlobPtrs[];
-///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
-///  ...
-///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
-///  reduce_function(reduce_data, GlobPtrs);
-static llvm::Value *emitGlobalToListReduceFunction(
-    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
-    QualType ReductionArrayTy, SourceLocation Loc,
-    const RecordDecl *TeamReductionRec,
-    const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
-        &VarFieldMap,
-    llvm::Function *ReduceFn) {
-  ASTContext &C = CGM.getContext();
-
-  // Buffer: global reduction buffer.
-  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                              C.VoidPtrTy, ImplicitParamKind::Other);
-  // Idx: index of the buffer.
-  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
-                           ImplicitParamKind::Other);
-  // ReduceList: thread local Reduce list.
-  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
-                                  C.VoidPtrTy, ImplicitParamKind::Other);
-  FunctionArgList Args;
-  Args.push_back(&BufferArg);
-  Args.push_back(&IdxArg);
-  Args.push_back(&ReduceListArg);
-
-  const CGFunctionInfo &CGFI =
-      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
-  auto *Fn = llvm::Function::Create(
-      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
-      "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
-  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
-  Fn->setDoesNotRecurse();
-  CodeGenFunction CGF(CGM);
-  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
-  CGBuilderTy &Bld = CGF.Builder;
-
-  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
-  QualType StaticTy = C.getRecordType(TeamReductionRec);
-  llvm::Type *LLVMReductionsBufferTy =
-      CGM.getTypes().ConvertTypeForMem(StaticTy);
-  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-      CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
-      LLVMReductionsBufferTy->getPointerTo());
-
-  // 1. Build a list of reduction variables.
-  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
-  Address ReductionList =
-      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
-  auto IPriv = Privates.begin();
-  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
-                                              /*Volatile=*/false, C.IntTy,
-                                              Loc)};
-  unsigned Idx = 0;
-  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
-    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-    // Global = Buffer.VD[Idx];
-    const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
-    const FieldDecl *FD = VarFieldMap.lookup(VD);
-    llvm::Value *BufferPtr =
-        Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
-    LValue GlobLVal = CGF.EmitLValueForField(
-        CGF.MakeNaturalAlignAddrLValue(BufferPtr, StaticTy), FD);
-    Address GlobAddr = GlobLVal.getAddress(CGF);
-    CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
-                          C.VoidPtrTy);
-    if ((*IPriv)->getType()->isVariablyModifiedType()) {
-      // Store array size.
-      ++Idx;
-      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-      llvm::Value *Size = CGF.Builder.CreateIntCast(
-          CGF.getVLASize(
-                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
-              .NumElts,
-          CGF.SizeTy, /*isSigned=*/false);
-      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
-                              Elem);
-    }
-  }
-
-  // Call reduce_function(ReduceList, GlobalReduceList)
-  llvm::Value *GlobalReduceList = ReductionList.getPointer();
-  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
-  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
-      AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
-  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
-      CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
-  CGF.FinishFunction();
-  return Fn;
-}
-
 ///
 /// Design of OpenMP reductions on the GPU
 ///
@@ -2805,10 +1654,13 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
 
   bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
+  bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
 #ifndef NDEBUG
   bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
 #endif
 
+  ASTContext &C = CGM.getContext();
+
   if (Options.SimpleReduction) {
     assert(!TeamsReduction && !ParallelReduction &&
            "Invalid reduction selection in emitReduction.");
@@ -2817,155 +1669,78 @@ void CGOpenMPRuntimeGPU::emitReduction(
     return;
   }
 
-  assert((TeamsReduction || ParallelReduction) &&
-         "Invalid reduction selection in emitReduction.");
-
-  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
-  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
-  int Cnt = 0;
-  for (const Expr *DRE : Privates) {
-    PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
-    ++Cnt;
-  }
-
-  ASTContext &C = CGM.getContext();
-  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
-      CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
-
-  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
-  // RedList, shuffle_reduce_func, interwarp_copy_func);
-  // or
-  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+  // Source location for theident struct
   llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
 
-  llvm::Value *Res;
-  // 1. Build a list of reduction variables.
-  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
-  auto Size = RHSExprs.size();
-  for (const Expr *E : Privates) {
-    if (E->getType()->isVariablyModifiedType())
-      // Reserve place for array size.
-      ++Size;
-  }
-  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
-  QualType ReductionArrayTy = C.getConstantArrayType(
-      C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
-      /*IndexTypeQuals=*/0);
-  Address ReductionList =
-      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
-  auto IPriv = Privates.begin();
-  unsigned Idx = 0;
-  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
-    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-    CGF.Builder.CreateStore(
-        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-            CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
-        Elem);
-    if ((*IPriv)->getType()->isVariablyModifiedType()) {
-      // Store array size.
-      ++Idx;
-      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
-      llvm::Value *Size = CGF.Builder.CreateIntCast(
-          CGF.getVLASize(
-                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
-              .NumElts,
-          CGF.SizeTy, /*isSigned=*/false);
-      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
-                              Elem);
-    }
-  }
-
-  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReductionList.getPointer(), CGF.VoidPtrTy);
-  llvm::Function *ReductionFn = emitReductionFunction(
-      CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
-      Privates, LHSExprs, RHSExprs, ReductionOps);
-  llvm::Value *ReductionDataSize =
-      CGF.getTypeSize(C.getRecordType(ReductionRec));
-  ReductionDataSize =
-      CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
-  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
-      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
-  llvm::Value *InterWarpCopyFn =
-      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
-
-  if (ParallelReduction) {
-    llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
-                           InterWarpCopyFn};
-
-    Res = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
-        Args);
-  } else {
-    assert(TeamsReduction && "expected teams reduction.");
-    TeamsReductions.push_back(ReductionRec);
-    auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
-        {}, "_openmp_teams_reductions_buffer_$_$ptr");
-    llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
-    llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
-        ReductionFn);
-    llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
-    llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
-        CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
-        ReductionFn);
-
-    llvm::Value *Args[] = {
-        RTLoc,
-        KernelTeamsReductionPtr,
-        CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
-        ReductionDataSize,
-        RL,
-        ShuffleAndReduceFn,
-        InterWarpCopyFn,
-        GlobalToBufferCpyFn,
-        GlobalToBufferRedFn,
-        BufferToGlobalCpyFn,
-        BufferToGlobalRedFn};
-
-    Res = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
-        Args);
-  }
+  using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
+  InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
+                         CGF.AllocaInsertPt->getIterator());
+  InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
+                          CGF.Builder.GetInsertPoint());
+  llvm::OpenMPIRBuilder::LocationDescription OmpLoc(CodeGenIP);
+  llvm::SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos;
 
-  // 5. Build if (res == 1)
-  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
-  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
-  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
-      Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
-  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
-
-  // 6. Build then branch: where we have reduced values in the master
-  //    thread in each team.
-  //    __kmpc_end_reduce{_nowait}(<gtid>);
-  //    break;
-  CGF.EmitBlock(ThenBB);
-
-  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
-  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
-                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
-    auto IPriv = Privates.begin();
-    auto ILHS = LHSExprs.begin();
-    auto IRHS = RHSExprs.begin();
-    for (const Expr *E : ReductionOps) {
-      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
-                                  cast<DeclRefExpr>(*IRHS));
-      ++IPriv;
-      ++ILHS;
-      ++IRHS;
+  CodeGenFunction::OMPPrivateScope Scope(CGF);
+  unsigned Idx = 0;
+  for (const Expr *Private : Privates) {
+    llvm::Type *ElementType;
+    llvm::Value *Variable;
+    llvm::Value *PrivateVariable;
+    llvm::OpenMPIRBuilder::AtomicReductionGenCB AtomicReductionGen = nullptr;
+    ElementType = CGF.ConvertTypeForMem(Private->getType());
+    const auto *RHSVar =
+        cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
+    PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).getPointer();
+    const auto *LHSVar =
+        cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
+    Variable = CGF.GetAddrOfLocalVar(LHSVar).getPointer();
+    llvm::OpenMPIRBuilder::EvaluationKindTy EvalKind;
+    switch (CGF.getEvaluationKind(Private->getType())) {
+    case TEK_Scalar:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar;
+      break;
+    case TEK_Complex:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Complex;
+      break;
+    case TEK_Aggregate:
+      EvalKind = llvm::OpenMPIRBuilder::EvaluationKindTy::Aggregate;
+      break;
     }
-  };
-  RegionCodeGenTy RCG(CodeGen);
-  RCG(CGF);
-  // There is no need to emit line number for unconditional branch.
-  (void)ApplyDebugLocation::CreateEmpty(CGF);
-  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
+    auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
+                            llvm::Value **LHSPtr, llvm::Value **RHSPtr,
+                            llvm::Function *NewFunc) {
+      CGF.Builder.restoreIP(CodeGenIP);
+      auto *CurFn = CGF.CurFn;
+      CGF.CurFn = NewFunc;
+
+      *LHSPtr = CGF.GetAddrOfLocalVar(
+                       cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
+                    .getPointer();
+      *RHSPtr = CGF.GetAddrOfLocalVar(
+                       cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
+                    .getPointer();
+
+      emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
+                                  cast<DeclRefExpr>(LHSExprs[I]),
+                                  cast<DeclRefExpr>(RHSExprs[I]));
+
+      CGF.CurFn = CurFn;
+
+      return InsertPointTy(CGF.Builder.GetInsertBlock(),
+                           CGF.Builder.GetInsertPoint());
+    };
+    ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
+        ElementType, Variable, PrivateVariable, EvalKind,
+        /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
+    Idx++;
+  }
+
+  CGF.Builder.restoreIP(OMPBuilder.createReductionsGPU(
+      OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
+      DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBTy::Clang,
+      CGF.getTarget().getGridValue(), C.getLangOpts().OpenMPCUDAReductionBufNum,
+      RTLoc));
+  return;
 }
 
 const VarDecl *
@@ -3560,10 +2335,3 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
           CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
       Args);
 }
-
-llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
-  ArrayRef<llvm::Value *> Args{};
-  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
-                             Args);
-}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 141436f26230dd..4d586ec972f8d6 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -150,9 +150,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
       CodeGenFunction &CGF,
       const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) override;
 
-  /// Get the GPU warp size.
-  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF);
-
   /// Get the id of the current thread on the GPU.
   llvm::Value *getGPUThreadID(CodeGenFunction &CGF);
 
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
index fac42732022ad8..567598250301fd 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp
@@ -36,14 +36,14 @@ void test() {
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_kernel_environment, ptr [[DYN_PTR]])
 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK1:       user_code.entry:
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1:[0-9]+]])
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA14:![0-9]+]]
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit()
 // CHECK1-NEXT:    ret void
@@ -66,78 +66,78 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[REF_TMP2:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
 // CHECK1-NEXT:    [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 8)
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2:[0-9]+]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA16:![0-9]+]]
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP2]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP2]], align 4
 // CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP2]]) #[[ATTR12:[0-9]+]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP2]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4
 // CHECK1-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
@@ -161,9 +161,9 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
@@ -197,79 +197,79 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP15:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[REF_TMP16:%.*]] = alloca float, align 4
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
 // CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
 // CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
 // CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK1:       omp.precond.then:
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]]
-// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP6]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT:    store float 0.000000e+00, ptr [[REF_TMP6]], align 4
 // CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP6]]) #[[ATTR12]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP6]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    call void @__kmpc_for_static_init_4u(ptr @[[GLOB3:[0-9]+]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
 // CHECK1-NEXT:    [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
 // CHECK1-NEXT:    br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[ADD9:%.*]] = add i32 [[TMP19]], 1
 // CHECK1-NEXT:    [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
 // CHECK1-NEXT:    br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]]
@@ -278,28 +278,28 @@ void test() {
 // CHECK1:       omp.dispatch.body:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[ADD11:%.*]] = add i32 [[TMP21]], 1
 // CHECK1-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
 // CHECK1-NEXT:    br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP23]], 1
 // CHECK1-NEXT:    [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP14]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4
 // CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP24]] to float
-// CHECK1-NEXT:    store float [[CONV]], ptr [[REF_TMP15]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT:    store float [[CONV]], ptr [[REF_TMP15]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4
 // CHECK1-NEXT:    [[CONV17:%.*]] = sitofp i32 [[TMP25]] to float
-// CHECK1-NEXT:    store float [[CONV17]], ptr [[REF_TMP16]], align 4, !tbaa [[TBAA16]]
+// CHECK1-NEXT:    store float [[CONV17]], ptr [[REF_TMP16]], align 4
 // CHECK1-NEXT:    call void @_ZNSt7complexIfEC1ERKfS2_(ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP15]], ptr nonnull align 4 dereferenceable(4) [[REF_TMP16]]) #[[ATTR12]]
 // CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 4 dereferenceable(8) ptr @_ZNSt7complexIfEpLIfEERS0_RKS_IT_E(ptr nonnull align 4 dereferenceable(8) [[PARTIAL_SUM5]], ptr nonnull align 4 dereferenceable(8) [[REF_TMP14]]) #[[ATTR12]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[REF_TMP16]]) #[[ATTR4]]
@@ -309,25 +309,25 @@ void test() {
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
 // CHECK1-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
 // CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
 // CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -358,53 +358,53 @@ void test() {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
 // CHECK1-NEXT:    [[CALL:%.*]] = call float @_ZNKSt7complexIfE4realEv(ptr nonnull align 4 dereferenceable(8) [[TMP0]]) #[[ATTR12]]
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA18:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[__RE_]], align 4
 // CHECK1-NEXT:    [[ADD:%.*]] = fadd float [[TMP1]], [[CALL]]
-// CHECK1-NEXT:    store float [[ADD]], ptr [[__RE_]], align 4, !tbaa [[TBAA18]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store float [[ADD]], ptr [[__RE_]], align 4
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
 // CHECK1-NEXT:    [[CALL2:%.*]] = call float @_ZNKSt7complexIfE4imagEv(ptr nonnull align 4 dereferenceable(8) [[TMP2]]) #[[ATTR12]]
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA20:![0-9]+]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[__IM_]], align 4
 // CHECK1-NEXT:    [[ADD3:%.*]] = fadd float [[TMP3]], [[CALL2]]
-// CHECK1-NEXT:    store float [[ADD3]], ptr [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT:    store float [[ADD3]], ptr [[__IM_]], align 4
 // CHECK1-NEXT:    ret ptr [[THIS1]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
 // CHECK1-NEXT:    [[DOTADDR2:%.*]] = alloca i16, align 2
 // CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 4
-// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21:![0-9]+]]
-// CHECK1-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
+// CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex", align 8
+// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr %"class.std::complex", ptr [[TMP9]], i64 1
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 4
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i64, ptr [[TMP9]], align 8
 // CHECK1-NEXT:    [[TMP13:%.*]] = call i32 @__kmpc_get_warp_size()
 // CHECK1-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i16
 // CHECK1-NEXT:    [[TMP15:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP12]], i16 [[TMP6]], i16 [[TMP14]])
-// CHECK1-NEXT:    store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 4
+// CHECK1-NEXT:    store i64 [[TMP15]], ptr [[DOTOMP_REDUCTION_ELEMENT]], align 8
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr i64, ptr [[TMP9]], i64 1
 // CHECK1-NEXT:    [[TMP17:%.*]] = getelementptr i64, ptr [[DOTOMP_REDUCTION_ELEMENT]], i64 1
-// CHECK1-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP18:%.*]] = icmp eq i16 [[TMP7]], 0
 // CHECK1-NEXT:    [[TMP19:%.*]] = icmp eq i16 [[TMP7]], 1
 // CHECK1-NEXT:    [[TMP20:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
@@ -433,7 +433,7 @@ void test() {
 // CHECK1-NEXT:    [[TMP34:%.*]] = load ptr, ptr [[TMP33]], align 8
 // CHECK1-NEXT:    [[TMP35:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP36:%.*]] = load ptr, ptr [[TMP35]], align 8
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP36]], ptr align 4 [[TMP34]], i64 8, i1 false), !tbaa.struct [[TBAA_STRUCT23:![0-9]+]]
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP36]], ptr align 8 [[TMP34]], i64 8, i1 false)
 // CHECK1-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK1:       else5:
 // CHECK1-NEXT:    br label [[IFCONT6]]
@@ -442,24 +442,24 @@ void test() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
 // CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND:%.*]]
 // CHECK1:       precond:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
 // CHECK1-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
 // CHECK1:       body:
@@ -468,7 +468,7 @@ void test() {
 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK1:       then:
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
@@ -478,45 +478,45 @@ void test() {
 // CHECK1-NEXT:    br label [[IFCONT]]
 // CHECK1:       ifcont:
 // CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
 // CHECK1:       then2:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
 // CHECK1-NEXT:    br label [[IFCONT4:%.*]]
 // CHECK1:       else3:
 // CHECK1-NEXT:    br label [[IFCONT4]]
 // CHECK1:       ifcont4:
 // CHECK1-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK1-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND]]
 // CHECK1:       exit:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined_wrapper
-// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8:[0-9]+]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIfEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
 // CHECK1-NEXT:    ret void
 //
@@ -527,14 +527,14 @@ void test() {
 // CHECK1-NEXT:    [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
-// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_kernel_environment, ptr [[DYN_PTR]])
 // CHECK1-NEXT:    [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
 // CHECK1-NEXT:    br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
 // CHECK1:       user_code.entry:
 // CHECK1-NEXT:    [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @__kmpc_target_deinit()
 // CHECK1-NEXT:    ret void
@@ -557,78 +557,78 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[REF_TMP2:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [3 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
 // CHECK1-NEXT:    [[ISTART:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[IEND:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 4)
 // CHECK1-NEXT:    [[PARTIAL_SUM:%.*]] = call align 16 ptr @__kmpc_alloc_shared(i64 16)
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 99, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[IB]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
 // CHECK1-NEXT:    call void @__kmpc_distribute_static_init_4(ptr @[[GLOB2]], i32 [[TMP1]], i32 92, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
-// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP2]], 99
 // CHECK1-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[TMP3]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[CMP1:%.*]] = icmp sle i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP7]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD]], ptr [[IB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA24:![0-9]+]]
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP2]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP2]], align 8
 // CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP2]]) #[[ATTR12]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP2]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[IB]], align 4
 // CHECK1-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP8]], 4
-// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[MUL3]], ptr [[ISTART]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[IB]], align 4
 // CHECK1-NEXT:    [[ADD4:%.*]] = add nsw i32 [[TMP9]], 1
 // CHECK1-NEXT:    [[MUL5:%.*]] = mul nsw i32 [[ADD4]], 4
-// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[MUL5]], ptr [[IEND]], align 4
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[TMP11]], align 8
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [3 x ptr], ptr [[CAPTURED_VARS_ADDRS]], i64 0, i64 2
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[TMP12]], align 8
 // CHECK1-NEXT:    call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 3)
 // CHECK1-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP13]], 1
-// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD6]], ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
@@ -652,9 +652,9 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
 // CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
@@ -688,79 +688,79 @@ void test() {
 // CHECK1-NEXT:    [[REF_TMP15:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[REF_TMP16:%.*]] = alloca double, align 8
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[ISTART]], ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[IEND]], ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[PARTIAL_SUM]], ptr [[PARTIAL_SUM_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ISTART_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[IEND_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PARTIAL_SUM_ADDR]], align 8
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IV]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load i32, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_1]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK1-NEXT:    store i32 [[TMP4]], ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTCAPTURE_EXPR_2]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
 // CHECK1-NEXT:    [[SUB:%.*]] = sub i32 [[TMP5]], [[TMP6]]
 // CHECK1-NEXT:    [[SUB3:%.*]] = sub i32 [[SUB]], 1
 // CHECK1-NEXT:    [[ADD:%.*]] = add i32 [[SUB3]], 1
 // CHECK1-NEXT:    [[DIV:%.*]] = udiv i32 [[ADD]], 1
 // CHECK1-NEXT:    [[SUB4:%.*]] = sub i32 [[DIV]], 1
-// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_2]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    store i32 [[TMP7]], ptr [[I]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_1]], align 4
 // CHECK1-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP8]], [[TMP9]]
 // CHECK1-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
 // CHECK1:       omp.precond.then:
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_LB]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_UB]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK1-NEXT:    store i32 [[TMP10]], ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_STRIDE]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[DOTOMP_IS_LAST]]) #[[ATTR4]]
-// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[PARTIAL_SUM5]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP]], align 8
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]]
-// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP6]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    store double 0.000000e+00, ptr [[REF_TMP6]], align 8
 // CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP6]]) #[[ATTR12]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP6]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr [[I7]]) #[[ATTR4]]
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4
 // CHECK1-NEXT:    call void @__kmpc_for_static_init_4u(ptr @[[GLOB3]], i32 [[TMP12]], i32 33, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND:%.*]]
 // CHECK1:       omp.dispatch.cond:
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
 // CHECK1-NEXT:    [[CMP8:%.*]] = icmp ugt i32 [[TMP13]], [[TMP14]]
 // CHECK1-NEXT:    br i1 [[CMP8]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 // CHECK1:       cond.true:
-// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
 // CHECK1-NEXT:    br label [[COND_END:%.*]]
 // CHECK1:       cond.false:
-// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    br label [[COND_END]]
 // CHECK1:       cond.end:
 // CHECK1-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
-// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    store i32 [[TMP17]], ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[ADD9:%.*]] = add i32 [[TMP19]], 1
 // CHECK1-NEXT:    [[CMP10:%.*]] = icmp ult i32 [[TMP18]], [[ADD9]]
 // CHECK1-NEXT:    br i1 [[CMP10]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_CLEANUP:%.*]]
@@ -769,28 +769,28 @@ void test() {
 // CHECK1:       omp.dispatch.body:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
 // CHECK1:       omp.inner.for.cond:
-// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK1-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    [[ADD11:%.*]] = add i32 [[TMP21]], 1
 // CHECK1-NEXT:    [[CMP12:%.*]] = icmp ult i32 [[TMP20]], [[ADD11]]
 // CHECK1-NEXT:    br i1 [[CMP12]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_COND_CLEANUP:%.*]]
 // CHECK1:       omp.inner.for.cond.cleanup:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_END:%.*]]
 // CHECK1:       omp.inner.for.body:
-// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP22:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK1-NEXT:    [[TMP23:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[MUL:%.*]] = mul i32 [[TMP23]], 1
 // CHECK1-NEXT:    [[ADD13:%.*]] = add i32 [[TMP22]], [[MUL]]
-// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD13]], ptr [[I7]], align 4
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr [[REF_TMP14]]) #[[ATTR4]]
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP15]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP24:%.*]] = load i32, ptr [[I7]], align 4
 // CHECK1-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP24]] to double
-// CHECK1-NEXT:    store double [[CONV]], ptr [[REF_TMP15]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    store double [[CONV]], ptr [[REF_TMP15]], align 8
 // CHECK1-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]]
-// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I7]], align 4
 // CHECK1-NEXT:    [[CONV17:%.*]] = sitofp i32 [[TMP25]] to double
-// CHECK1-NEXT:    store double [[CONV17]], ptr [[REF_TMP16]], align 8, !tbaa [[TBAA24]]
+// CHECK1-NEXT:    store double [[CONV17]], ptr [[REF_TMP16]], align 8
 // CHECK1-NEXT:    call void @_ZNSt7complexIdEC1ERKdS2_(ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP15]], ptr nonnull align 8 dereferenceable(8) [[REF_TMP16]]) #[[ATTR12]]
 // CHECK1-NEXT:    [[CALL:%.*]] = call nonnull align 8 dereferenceable(16) ptr @_ZNSt7complexIdEpLIdEERS0_RKS_IT_E(ptr nonnull align 8 dereferenceable(16) [[PARTIAL_SUM5]], ptr nonnull align 8 dereferenceable(16) [[REF_TMP14]]) #[[ATTR12]]
 // CHECK1-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr [[REF_TMP16]]) #[[ATTR4]]
@@ -800,25 +800,25 @@ void test() {
 // CHECK1:       omp.body.continue:
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
 // CHECK1:       omp.inner.for.inc:
-// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP26:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    [[ADD18:%.*]] = add i32 [[TMP26]], 1
-// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD18]], ptr [[DOTOMP_IV]], align 4
 // CHECK1-NEXT:    br label [[OMP_INNER_FOR_COND]]
 // CHECK1:       omp.inner.for.end:
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_INC:%.*]]
 // CHECK1:       omp.dispatch.inc:
-// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP27:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    [[TMP28:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    [[ADD19:%.*]] = add i32 [[TMP27]], [[TMP28]]
-// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD19]], ptr [[DOTOMP_LB]], align 4
+// CHECK1-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK1-NEXT:    [[TMP30:%.*]] = load i32, ptr [[DOTOMP_STRIDE]], align 4
 // CHECK1-NEXT:    [[ADD20:%.*]] = add i32 [[TMP29]], [[TMP30]]
-// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[ADD20]], ptr [[DOTOMP_UB]], align 4
 // CHECK1-NEXT:    br label [[OMP_DISPATCH_COND]]
 // CHECK1:       omp.dispatch.end:
 // CHECK1-NEXT:    [[TMP31:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP32:%.*]] = load i32, ptr [[TMP31]], align 4
 // CHECK1-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB3]], i32 [[TMP32]])
 // CHECK1-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
 // CHECK1-NEXT:    store ptr [[PARTIAL_SUM5]], ptr [[TMP33]], align 8
@@ -849,26 +849,26 @@ void test() {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__C_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__C]], ptr [[__C_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
 // CHECK1-NEXT:    [[CALL:%.*]] = call double @_ZNKSt7complexIdE4realEv(ptr nonnull align 8 dereferenceable(16) [[TMP0]]) #[[ATTR12]]
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA26:![0-9]+]]
+// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[__RE_]], align 8
 // CHECK1-NEXT:    [[ADD:%.*]] = fadd double [[TMP1]], [[CALL]]
-// CHECK1-NEXT:    store double [[ADD]], ptr [[__RE_]], align 8, !tbaa [[TBAA26]]
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store double [[ADD]], ptr [[__RE_]], align 8
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__C_ADDR]], align 8
 // CHECK1-NEXT:    [[CALL2:%.*]] = call double @_ZNKSt7complexIdE4imagEv(ptr nonnull align 8 dereferenceable(16) [[TMP2]]) #[[ATTR12]]
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA28:![0-9]+]]
+// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[__IM_]], align 8
 // CHECK1-NEXT:    [[ADD3:%.*]] = fadd double [[TMP3]], [[CALL2]]
-// CHECK1-NEXT:    store double [[ADD3]], ptr [[__IM_]], align 8, !tbaa [[TBAA28]]
+// CHECK1-NEXT:    store double [[ADD3]], ptr [[__IM_]], align 8
 // CHECK1-NEXT:    ret ptr [[THIS1]]
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func1
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i16 noundef signext [[TMP1:%.*]], i16 noundef signext [[TMP2:%.*]], i16 noundef signext [[TMP3:%.*]]) #[[ATTR7]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i16, align 2
@@ -876,14 +876,14 @@ void test() {
 // CHECK1-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8
 // CHECK1-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca %"class.std::complex.0", align 8
-// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2, !tbaa [[TBAA21]]
+// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT:    store i16 [[TMP3]], ptr [[DOTADDR3]], align 2
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    [[TMP5:%.*]] = load i16, ptr [[DOTADDR1]], align 2
+// CHECK1-NEXT:    [[TMP6:%.*]] = load i16, ptr [[DOTADDR2]], align 2
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i16, ptr [[DOTADDR3]], align 2
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0
@@ -908,7 +908,7 @@ void test() {
 // CHECK1-NEXT:    [[TMP24]] = getelementptr i64, ptr [[TMP13]], i64 1
 // CHECK1-NEXT:    br label [[DOTSHUFFLE_PRE_COND]]
 // CHECK1:       .shuffle.exit:
-// CHECK1-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[DOTOMP_REDUCTION_ELEMENT]], ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP25:%.*]] = icmp eq i16 [[TMP7]], 0
 // CHECK1-NEXT:    [[TMP26:%.*]] = icmp eq i16 [[TMP7]], 1
 // CHECK1-NEXT:    [[TMP27:%.*]] = icmp ult i16 [[TMP5]], [[TMP6]]
@@ -937,7 +937,7 @@ void test() {
 // CHECK1-NEXT:    [[TMP41:%.*]] = load ptr, ptr [[TMP40]], align 8
 // CHECK1-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP4]], i64 0, i64 0
 // CHECK1-NEXT:    [[TMP43:%.*]] = load ptr, ptr [[TMP42]], align 8
-// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false), !tbaa.struct [[TBAA_STRUCT29:![0-9]+]]
+// CHECK1-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP43]], ptr align 8 [[TMP41]], i64 16, i1 false)
 // CHECK1-NEXT:    br label [[IFCONT6:%.*]]
 // CHECK1:       else5:
 // CHECK1-NEXT:    br label [[IFCONT6]]
@@ -946,24 +946,24 @@ void test() {
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func2
-// CHECK1-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (ptr noundef [[TMP0:%.*]], i32 noundef [[TMP1:%.*]]) #[[ATTR7]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
-// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
 // CHECK1-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK1-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
 // CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK1-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND:%.*]]
 // CHECK1:       precond:
-// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 4
 // CHECK1-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
 // CHECK1:       body:
@@ -972,7 +972,7 @@ void test() {
 // CHECK1-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK1:       then:
 // CHECK1-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
 // CHECK1-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
@@ -982,45 +982,45 @@ void test() {
 // CHECK1-NEXT:    br label [[IFCONT]]
 // CHECK1:       ifcont:
 // CHECK1-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
-// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
 // CHECK1-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
 // CHECK1:       then2:
 // CHECK1-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
 // CHECK1-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
 // CHECK1-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK1-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4, !tbaa [[TBAA14]]
-// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
+// CHECK1-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
 // CHECK1-NEXT:    br label [[IFCONT4:%.*]]
 // CHECK1:       else3:
 // CHECK1-NEXT:    br label [[IFCONT4]]
 // CHECK1:       ifcont4:
 // CHECK1-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK1-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
 // CHECK1-NEXT:    br label [[PRECOND]]
 // CHECK1:       exit:
 // CHECK1-NEXT:    ret void
 //
 //
 // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined_wrapper
-// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR7]] {
+// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR8]] {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[DOTADDR:%.*]] = alloca i16, align 2
 // CHECK1-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
 // CHECK1-NEXT:    [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2, !tbaa [[TBAA21]]
-// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4, !tbaa [[TBAA14]]
+// CHECK1-NEXT:    store i16 [[TMP0]], ptr [[DOTADDR]], align 2
+// CHECK1-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
 // CHECK1-NEXT:    store i32 0, ptr [[DOTZERO_ADDR]], align 4
 // CHECK1-NEXT:    call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]])
 // CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8
 // CHECK1-NEXT:    [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 0
-// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[TMP3]], align 8
 // CHECK1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 1
-// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8
 // CHECK1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i64 2
-// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
 // CHECK1-NEXT:    call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z17complex_reductionIdEvv_l16_omp_outlined_omp_outlined(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP4]], ptr [[TMP6]], ptr [[TMP8]]) #[[ATTR4]]
 // CHECK1-NEXT:    ret void
 //
@@ -1031,18 +1031,18 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4, !tbaa [[TBAA16]]
-// CHECK1-NEXT:    store float [[TMP1]], ptr [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMP0]], align 4
+// CHECK1-NEXT:    store float [[TMP1]], ptr [[__RE_]], align 4
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4, !tbaa [[TBAA16]]
-// CHECK1-NEXT:    store float [[TMP3]], ptr [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load float, ptr [[TMP2]], align 4
+// CHECK1-NEXT:    store float [[TMP3]], ptr [[__IM_]], align 4
 // CHECK1-NEXT:    ret void
 //
 //
@@ -1050,10 +1050,10 @@ void test() {
 // CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4, !tbaa [[TBAA18]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__RE_]], align 4
 // CHECK1-NEXT:    ret float [[TMP0]]
 //
 //
@@ -1061,10 +1061,10 @@ void test() {
 // CHECK1-SAME: (ptr nonnull align 4 dereferenceable(8) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4, !tbaa [[TBAA20]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load float, ptr [[__IM_]], align 4
 // CHECK1-NEXT:    ret float [[TMP0]]
 //
 //
@@ -1074,18 +1074,18 @@ void test() {
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__RE_ADDR:%.*]] = alloca ptr, align 8
 // CHECK1-NEXT:    [[__IM_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__RE]], ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT:    store ptr [[__IM]], ptr [[__IM_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8, !tbaa [[TBAA24]]
-// CHECK1-NEXT:    store double [[TMP1]], ptr [[__RE_]], align 8, !tbaa [[TBAA26]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[__RE_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8
+// CHECK1-NEXT:    store double [[TMP1]], ptr [[__RE_]], align 8
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8, !tbaa [[TBAA10]]
-// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8, !tbaa [[TBAA24]]
-// CHECK1-NEXT:    store double [[TMP3]], ptr [[__IM_]], align 8, !tbaa [[TBAA28]]
+// CHECK1-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[__IM_ADDR]], align 8
+// CHECK1-NEXT:    [[TMP3:%.*]] = load double, ptr [[TMP2]], align 8
+// CHECK1-NEXT:    store double [[TMP3]], ptr [[__IM_]], align 8
 // CHECK1-NEXT:    ret void
 //
 //
@@ -1093,10 +1093,10 @@ void test() {
 // CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__RE_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8, !tbaa [[TBAA26]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__RE_]], align 8
 // CHECK1-NEXT:    ret double [[TMP0]]
 //
 //
@@ -1104,9 +1104,9 @@ void test() {
 // CHECK1-SAME: (ptr nonnull align 8 dereferenceable(16) [[THIS:%.*]]) #[[ATTR6]] comdat align 2 {
 // CHECK1-NEXT:  entry:
 // CHECK1-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
-// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8, !tbaa [[TBAA10]]
+// CHECK1-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
 // CHECK1-NEXT:    [[__IM_:%.*]] = getelementptr inbounds %"class.std::complex.0", ptr [[THIS1]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8, !tbaa [[TBAA28]]
+// CHECK1-NEXT:    [[TMP0:%.*]] = load double, ptr [[__IM_]], align 8
 // CHECK1-NEXT:    ret double [[TMP0]]
 //
diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
index 360a780c75383c..e7beb174872321 100644
--- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp
@@ -579,7 +579,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
 // CHECK1-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-// CHECK1-NEXT:    store i8 [[TMP9]], ptr [[C]], align 4
+// CHECK1-NEXT:    store i8 [[TMP9]], ptr [[C]], align 1
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]]
@@ -630,7 +630,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
 // CHECK1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK1-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
-// CHECK1-NEXT:    [[TMP9:%.*]] = load i8, ptr [[C]], align 4
+// CHECK1-NEXT:    [[TMP9:%.*]] = load i8, ptr [[C]], align 1
 // CHECK1-NEXT:    store i8 [[TMP9]], ptr [[TMP7]], align 1
 // CHECK1-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
@@ -1156,7 +1156,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
 // CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
-// CHECK1-NEXT:    store i16 [[TMP13]], ptr [[B]], align 4
+// CHECK1-NEXT:    store i16 [[TMP13]], ptr [[B]], align 2
 // CHECK1-NEXT:    ret void
 //
 //
@@ -1207,7 +1207,7 @@ int bar(int n){
 // CHECK1-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
 // CHECK1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK1-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
-// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B]], align 4
+// CHECK1-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B]], align 2
 // CHECK1-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
 // CHECK1-NEXT:    ret void
 //
@@ -1766,7 +1766,7 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
 // CHECK2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-// CHECK2-NEXT:    store i8 [[TMP9]], ptr [[C]], align 4
+// CHECK2-NEXT:    store i8 [[TMP9]], ptr [[C]], align 1
 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]]
@@ -1817,7 +1817,7 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
 // CHECK2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK2-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
-// CHECK2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[C]], align 4
+// CHECK2-NEXT:    [[TMP9:%.*]] = load i8, ptr [[C]], align 1
 // CHECK2-NEXT:    store i8 [[TMP9]], ptr [[TMP7]], align 1
 // CHECK2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
@@ -2343,7 +2343,7 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
 // CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
-// CHECK2-NEXT:    store i16 [[TMP13]], ptr [[B]], align 4
+// CHECK2-NEXT:    store i16 [[TMP13]], ptr [[B]], align 2
 // CHECK2-NEXT:    ret void
 //
 //
@@ -2394,7 +2394,7 @@ int bar(int n){
 // CHECK2-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
 // CHECK2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK2-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
-// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B]], align 4
+// CHECK2-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B]], align 2
 // CHECK2-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
 // CHECK2-NEXT:    ret void
 //
@@ -2953,7 +2953,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
 // CHECK3-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
-// CHECK3-NEXT:    store i8 [[TMP9]], ptr [[C]], align 4
+// CHECK3-NEXT:    store i8 [[TMP9]], ptr [[C]], align 1
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP4]], i32 [[TMP5]]
@@ -3004,7 +3004,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
 // CHECK3-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK3-NEXT:    [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
-// CHECK3-NEXT:    [[TMP9:%.*]] = load i8, ptr [[C]], align 4
+// CHECK3-NEXT:    [[TMP9:%.*]] = load i8, ptr [[C]], align 1
 // CHECK3-NEXT:    store i8 [[TMP9]], ptr [[TMP7]], align 1
 // CHECK3-NEXT:    [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
@@ -3530,7 +3530,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
 // CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
-// CHECK3-NEXT:    store i16 [[TMP13]], ptr [[B]], align 4
+// CHECK3-NEXT:    store i16 [[TMP13]], ptr [[B]], align 2
 // CHECK3-NEXT:    ret void
 //
 //
@@ -3581,7 +3581,7 @@ int bar(int n){
 // CHECK3-NEXT:    [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
 // CHECK3-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 [[TMP5]]
 // CHECK3-NEXT:    [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP12]], i32 0, i32 1
-// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B]], align 4
+// CHECK3-NEXT:    [[TMP13:%.*]] = load i16, ptr [[B]], align 2
 // CHECK3-NEXT:    store i16 [[TMP13]], ptr [[TMP11]], align 2
 // CHECK3-NEXT:    ret void
 //
diff --git a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
index 3b1af7618794db..e8f62922e6890e 100644
--- a/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
+++ b/clang/test/OpenMP/target_teams_generic_loop_codegen.cpp
@@ -1197,6 +1197,7 @@ int foo() {
 // IR-GPU-NEXT:    [[J_CASTED:%.*]] = alloca i64, align 8, addrspace(5)
 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS:%.*]] = alloca [4 x ptr], align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[J_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_ADDR]] to ptr
@@ -1214,7 +1215,6 @@ int foo() {
 // IR-GPU-NEXT:    [[J4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J4]] to ptr
 // IR-GPU-NEXT:    [[J_CASTED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J_CASTED]] to ptr
 // IR-GPU-NEXT:    [[CAPTURED_VARS_ADDRS_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CAPTURED_VARS_ADDRS]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i64 [[J]], ptr [[J_ADDR_ASCAST]], align 8
@@ -1370,6 +1370,7 @@ int foo() {
 // IR-GPU-NEXT:    [[I:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[J5:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    [[DOTGLOBAL_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTGLOBAL_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTBOUND_TID__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTBOUND_TID__ADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTPREVIOUS_LB__ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTPREVIOUS_LB__ADDR]] to ptr
@@ -1387,7 +1388,6 @@ int foo() {
 // IR-GPU-NEXT:    [[SUM4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM4]] to ptr
 // IR-GPU-NEXT:    [[I_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I]] to ptr
 // IR-GPU-NEXT:    [[J5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[J5]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_RED_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_RED_LIST]] to ptr
 // IR-GPU-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i64 [[DOTPREVIOUS_LB_]], ptr [[DOTPREVIOUS_LB__ADDR_ASCAST]], align 8
@@ -1511,12 +1511,12 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -1592,10 +1592,10 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
@@ -1656,12 +1656,12 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR3:%.*]] = alloca i16, align 2, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x ptr], align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca [10 x [10 x i32]], align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR2]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR3]] to ptr
 // IR-GPU-NEXT:    [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to ptr
-// IR-GPU-NEXT:    [[DOTOMP_REDUCTION_ELEMENT_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTOMP_REDUCTION_ELEMENT]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i16 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 2
 // IR-GPU-NEXT:    store i16 [[TMP2]], ptr [[DOTADDR2_ASCAST]], align 2
@@ -1737,10 +1737,10 @@ int foo() {
 // IR-GPU-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8, addrspace(5)
 // IR-GPU-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4, addrspace(5)
 // IR-GPU-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr))
 // IR-GPU-NEXT:    [[DOTADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR]] to ptr
 // IR-GPU-NEXT:    [[DOTADDR1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTADDR1]] to ptr
-// IR-GPU-NEXT:    [[DOTCNT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTCNT_ADDR]] to ptr
 // IR-GPU-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR_ASCAST]], align 8
 // IR-GPU-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1_ASCAST]], align 4
 // IR-GPU-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index e940d9f655c869..abf05cc1e5e76a 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -100,6 +100,9 @@ class OpenMPIRBuilderConfig {
   /// expanded.
   std::optional<bool> IsGPU;
 
+  /// Flag for specifying if LLVMUsed information should be emitted.
+  std::optional<bool> EmitLLVMUsed;
+
   /// Flag for specifying if offloading is mandatory.
   std::optional<bool> OpenMPOffloadMandatory;
 
@@ -178,6 +181,7 @@ class OpenMPIRBuilderConfig {
 
   void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
   void setIsGPU(bool Value) { IsGPU = Value; }
+  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsed = Value; }
   void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
   void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
   void setSeparator(StringRef S) { Separator = S; }
@@ -620,15 +624,17 @@ class OpenMPIRBuilder {
   /// Generator for '#omp barrier'
   ///
   /// \param Loc The location where the barrier directive was encountered.
-  /// \param DK The kind of directive that caused the barrier.
+  /// \param Kind The kind of directive that caused the barrier.
   /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
   /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
   ///                        should be checked and acted upon.
+  /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
   ///
   /// \returns The insertion point after the barrier.
   InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK,
                               bool ForceSimpleCall = false,
-                              bool CheckCancelFlag = true);
+                              bool CheckCancelFlag = true,
+                              Value *ThreadID = nullptr);
 
   /// Generator for '#omp cancel'
   ///
@@ -1248,39 +1254,56 @@ class OpenMPIRBuilder {
   getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
                            StringRef ParentName = "");
 
-  // using ReductionGenTy =
-  //     function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+  /// Enum class for the RedctionGen CallBack type to be used.
+  enum class ReductionGenCBTy { Clang, MLIR };
 
-  // using AtomicReductionGenTy =
-  //     function_ref<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+  /// ReductionGen CallBack for Clang
+  ///
+  /// \param CodeGenIP InsertPoint for CodeGen.
+  /// \param Index Index of the ReductionInfo to generate code for.
+  /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
+  /// codegen, used for fixup later.
+  /// \param RHSPtr Optionally used by Clang to
+  /// return the RHSPtr it used for codegen, used for fixup later.
+  /// \param CurFn Optionally used by Clang to pass in the Current Function as
+  /// Clang context may be old.
+  using ReductionGenCBClang =
+      std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
+                                  Value **LHS, Value **RHS, Function *CurFn)>;
+
+  /// ReductionGen CallBack for MLIR
+  ///
+  /// \param CodeGenIP InsertPoint for CodeGen.
+  /// \param LHS Pass in the LHS Value to be used for CodeGen.
+  /// \param RHS Pass in the RHS Value to be used for CodeGen.
+  using ReductionGenCB = std::function<InsertPointTy(
+      InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
 
-  /// Owning equivalents of OpenMPIRBuilder::(Atomic)ReductionGen that are used
-  /// to
-  /// store lambdas with capture.
-  /// Functions used to generate reductions. Such functions take two Values
-  /// representing LHS and RHS of the reduction, respectively, and a reference
-  /// to the value that is updated to refer to the reduction result.
-  using ReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
-      OpenMPIRBuilder::InsertPointTy, Value *, Value *, Value *&)>;
   /// Functions used to generate atomic reductions. Such functions take two
   /// Values representing pointers to LHS and RHS of the reduction, as well as
   /// the element type of these pointers. They are expected to atomically
   /// update the LHS to the reduced value.
-  using AtomicReductionGenTy = std::function<OpenMPIRBuilder::InsertPointTy(
-      OpenMPIRBuilder::InsertPointTy, Type *, Value *, Value *)>;
+  using AtomicReductionGenCB =
+      std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
+
+  /// Enum class for reduction evaluation types scalar, complex and aggregate.
+  enum class EvaluationKindTy { Scalar, Complex, Aggregate };
 
   /// Information about an OpenMP reduction.
   struct ReductionInfo {
     ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
-                  ReductionGenTy ReductionGen,
-                  AtomicReductionGenTy AtomicReductionGen)
+                  EvaluationKindTy EvaluationKind, ReductionGenCB ReductionGen,
+                  ReductionGenCBClang ReductionGenClang,
+                  AtomicReductionGenCB AtomicReductionGen)
         : ElementType(ElementType), Variable(Variable),
-          PrivateVariable(PrivateVariable), ReductionGen(ReductionGen),
+          PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
+          ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
           AtomicReductionGen(AtomicReductionGen) {}
     ReductionInfo(Value *PrivateVariable)
         : ElementType(nullptr), Variable(nullptr),
-          PrivateVariable(PrivateVariable), ReductionGen(),
-          AtomicReductionGen() {}
+          PrivateVariable(PrivateVariable),
+          EvaluationKind(EvaluationKindTy::Scalar), ReductionGen(),
+          ReductionGenClang(), AtomicReductionGen() {}
 
     /// Reduction element type, must match pointee type of variable.
     Type *ElementType;
@@ -1291,22 +1314,43 @@ class OpenMPIRBuilder {
     /// Thread-private partial reduction variable.
     Value *PrivateVariable;
 
+    /// Reduction evaluation type - scalar, complex or aggregate.
+    EvaluationKindTy EvaluationKind;
+
     /// Callback for generating the reduction body. The IR produced by this will
     /// be used to combine two values in a thread-safe context, e.g., under
     /// lock or within the same thread, and therefore need not be atomic.
-    ReductionGenTy ReductionGen;
+    ReductionGenCB ReductionGen;
+
+    /// Clang callback for generating the reduction body. The IR produced by
+    /// this will be used to combine two values in a thread-safe context, e.g.,
+    /// under lock or within the same thread, and therefore need not be atomic.
+    ReductionGenCBClang ReductionGenClang;
 
     /// Callback for generating the atomic reduction body, may be null. The IR
     /// produced by this will be used to atomically combine two values during
     /// reduction. If null, the implementation will use the non-atomic version
     /// along with the appropriate synchronization mechanisms.
-    AtomicReductionGenTy AtomicReductionGen;
+    AtomicReductionGenCB AtomicReductionGen;
+  };
+
+  enum class CopyAction : unsigned {
+    // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
+    // the warp using shuffle instructions.
+    RemoteLaneToThread,
+    // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
+    ThreadCopy,
+  };
+
+  struct CopyOptionsTy {
+    Value *RemoteLaneOffset = nullptr;
+    Value *ScratchpadIndex = nullptr;
+    Value *ScratchpadWidth = nullptr;
   };
 
   /// A class that manages the reduction info to facilitate lowering of
   /// reductions at multiple levels of parallelism. For example handling teams
   /// and parallel reductions on GPUs
-
   class ReductionInfoManager {
   private:
     SmallVector<ReductionInfo> ReductionInfos;
@@ -1319,15 +1363,12 @@ class OpenMPIRBuilder {
       PrivateVarAllocaIP.reset();
     }
 
-    Value *
-    allocatePrivateReductionVar(IRBuilderBase &builder,
-                                llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
-                                Type *VarType) {
-      llvm::Type *ptrTy = llvm::PointerType::getUnqual(builder.getContext());
-      llvm::Value *var = builder.CreateAlloca(VarType);
+    Value *allocatePrivateReductionVar(IRBuilderBase &builder,
+                                       InsertPointTy &allocaIP, Type *VarType) {
+      Type *ptrTy = PointerType::getUnqual(builder.getContext());
+      Value *var = builder.CreateAlloca(VarType);
       var->setName("private_redvar");
-      llvm::Value *castVar =
-          builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
+      Value *castVar = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
       ReductionInfos.push_back(ReductionInfo(castVar));
       return castVar;
     }
@@ -1351,20 +1392,508 @@ class OpenMPIRBuilder {
     void setPrivateVarAllocaIP(InsertPointTy IP) { PrivateVarAllocaIP = IP; }
   };
 
+  /// Supporting functions for Reductions CodeGen.
+private:
+  /// Emit the llvm.used metadata.
+  void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
+
+  /// Get the id of the current thread on the GPU.
+  Value *getGPUThreadID();
+
+  /// Get the GPU warp size.
+  Value *getGPUWarpSize();
+
+  /// Get the id of the warp in the block.
+  /// We assume that the warp size is 32, which is always the case
+  /// on the NVPTX device, to generate more efficient code.
+  Value *getNVPTXWarpID();
+
+  /// Get the id of the current lane in the Warp.
+  /// We assume that the warp size is 32, which is always the case
+  /// on the NVPTX device, to generate more efficient code.
+  Value *getNVPTXLaneID();
+
+  /// Cast value to the specified type.
+  Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
+
+  /// This function creates calls to one of two shuffle functions to copy
+  /// variables between lanes in a warp.
+  Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
+                                      Type *ElementType, Value *Offset);
+
+  void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
+                       Type *ElementType, Value *Offset,
+                       Type *ReductionArrayTy);
+
+  /// Emit instructions to copy a Reduce list, which contains partially
+  /// aggregated values, in the specified direction.
+  void emitReductionListCopy(
+      InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+      ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+      CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
+
+  /// Emit a helper that reduces data across two OpenMP threads (lanes)
+  /// in the same warp.  It uses shuffle instructions to copy over data from
+  /// a remote lane's stack.  The reduction algorithm performed is specified
+  /// by the fourth parameter.
+  ///
+  /// Algorithm Versions.
+  /// Full Warp Reduce (argument value 0):
+  ///   This algorithm assumes that all 32 lanes are active and gathers
+  ///   data from these 32 lanes, producing a single resultant value.
+  /// Contiguous Partial Warp Reduce (argument value 1):
+  ///   This algorithm assumes that only a *contiguous* subset of lanes
+  ///   are active.  This happens for the last warp in a parallel region
+  ///   when the user specified num_threads is not an integer multiple of
+  ///   32.  This contiguous subset always starts with the zeroth lane.
+  /// Partial Warp Reduce (argument value 2):
+  ///   This algorithm gathers data from any number of lanes at any position.
+  /// All reduced values are stored in the lowest possible lane.  The set
+  /// of problems every algorithm addresses is a super set of those
+  /// addressable by algorithms with a lower version number.  Overhead
+  /// increases as algorithm version increases.
+  ///
+  /// Terminology
+  /// Reduce element:
+  ///   Reduce element refers to the individual data field with primitive
+  ///   data types to be combined and reduced across threads.
+  /// Reduce list:
+  ///   Reduce list refers to a collection of local, thread-private
+  ///   reduce elements.
+  /// Remote Reduce list:
+  ///   Remote Reduce list refers to a collection of remote (relative to
+  ///   the current thread) reduce elements.
+  ///
+  /// We distinguish between three states of threads that are important to
+  /// the implementation of this function.
+  /// Alive threads:
+  ///   Threads in a warp executing the SIMT instruction, as distinguished from
+  ///   threads that are inactive due to divergent control flow.
+  /// Active threads:
+  ///   The minimal set of threads that has to be alive upon entry to this
+  ///   function.  The computation is correct iff active threads are alive.
+  ///   Some threads are alive but they are not active because they do not
+  ///   contribute to the computation in any useful manner.  Turning them off
+  ///   may introduce control flow overheads without any tangible benefits.
+  /// Effective threads:
+  ///   In order to comply with the argument requirements of the shuffle
+  ///   function, we must keep all lanes holding data alive.  But at most
+  ///   half of them perform value aggregation; we refer to this half of
+  ///   threads as effective. The other half is simply handing off their
+  ///   data.
+  ///
+  /// Procedure
+  /// Value shuffle:
+  ///   In this step active threads transfer data from higher lane positions
+  ///   in the warp to lower lane positions, creating Remote Reduce list.
+  /// Value aggregation:
+  ///   In this step, effective threads combine their thread local Reduce list
+  ///   with Remote Reduce list and store the result in the thread local
+  ///   Reduce list.
+  /// Value copy:
+  ///   In this step, we deal with the assumption made by algorithm 2
+  ///   (i.e. contiguity assumption).  When we have an odd number of lanes
+  ///   active, say 2k+1, only k threads will be effective and therefore k
+  ///   new values will be produced.  However, the Reduce list owned by the
+  ///   (2k+1)th thread is ignored in the value aggregation.  Therefore
+  ///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
+  ///   that the contiguity assumption still holds.
+  ///
+  /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param ReduceFn The reduction function.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The ShuffleAndReduce function.
+  Function *emitShuffleAndReduceFunction(
+      ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+      Function *ReduceFn, AttributeList FuncAttrs);
+
+  /// This function emits a helper that gathers Reduce lists from the first
+  /// lane of every active warp to lanes in the first warp.
+  ///
+  /// void inter_warp_copy_func(void* reduce_data, num_warps)
+  ///   shared smem[warp_size];
+  ///   For all data entries D in reduce_data:
+  ///     sync
+  ///     If (I am the first lane in each warp)
+  ///       Copy my local D to smem[warp_id]
+  ///     sync
+  ///     if (I am the first warp)
+  ///       Copy smem[thread_id] to my local D
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The InterWarpCopy function.
+  Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
+                                      ArrayRef<ReductionInfo> ReductionInfos,
+                                      AttributeList FuncAttrs);
+
+  /// This function emits a helper that copies all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+  ///   For all data entries D in reduce_data:
+  ///     Copy local D to buffer.D[Idx]
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The ListToGlobalCopy function.
+  Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                         Type *ReductionsBufferTy,
+                                         AttributeList FuncAttrs);
+
+  /// This function emits a helper that copies all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
+  ///   For all data entries D in reduce_data:
+  ///     Copy buffer.D[Idx] to local D;
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The GlobalToList function.
+  Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                         Type *ReductionsBufferTy,
+                                         AttributeList FuncAttrs);
+
+  /// This function emits a helper that reduces all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
+  ///  void *GlobPtrs[];
+  ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+  ///  ...
+  ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+  ///  reduce_function(GlobPtrs, reduce_data);
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The ListToGlobalReduce function.
+  Function *
+  emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                 Function *ReduceFn, Type *ReductionsBufferTy,
+                                 AttributeList FuncAttrs);
+
+  /// This function emits a helper that reduces all the reduction variables from
+  /// the team into the provided global buffer for the reduction variables.
+  ///
+  /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
+  ///  void *GlobPtrs[];
+  ///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
+  ///  ...
+  ///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
+  ///  reduce_function(reduce_data, GlobPtrs);
+  ///
+  /// \param Loc The insert and source location description.
+  /// \param ReductionsBufferTy The StructTy for the reductions buffer.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The GlobalToListReduce function.
+  Function *
+  emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
+                                 Function *ReduceFn, Type *ReductionsBufferTy,
+                                 AttributeList FuncAttrs);
+
+  /// Get the function name of a reduction function.
+  std::string getReductionFuncName(StringRef Name) const;
+
+  /// Emits reduction function.
+  /// \param ReducerName Name of the function calling the reduction.
+  /// \param ReductionInfos Array type containing the ReductionOps.
+  /// \param IsGpu Optional param to specify CodeGen for GPU Offloading.
+  /// \param FuncAttrs Optional param to specify any function attributes that
+  ///                  need to be copied to the new function.
+  ///
+  /// \return The reduction function.
+  Function *createReductionFunction(
+      StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
+      bool IsGpu = false,
+      ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+      AttributeList FuncAttrs = {});
+
+public:
+  ///
+  /// Design of OpenMP reductions on the GPU
+  ///
+  /// Consider a typical OpenMP program with one or more reduction
+  /// clauses:
+  ///
+  /// float foo;
+  /// double bar;
+  /// #pragma omp target teams distribute parallel for \
+  ///             reduction(+:foo) reduction(*:bar)
+  /// for (int i = 0; i < N; i++) {
+  ///   foo += A[i]; bar *= B[i];
+  /// }
+  ///
+  /// where 'foo' and 'bar' are reduced across all OpenMP threads in
+  /// all teams.  In our OpenMP implementation on the NVPTX device an
+  /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
+  /// within a team are mapped to CUDA threads within a threadblock.
+  /// Our goal is to efficiently aggregate values across all OpenMP
+  /// threads such that:
+  ///
+  ///   - the compiler and runtime are logically concise, and
+  ///   - the reduction is performed efficiently in a hierarchical
+  ///     manner as follows: within OpenMP threads in the same warp,
+  ///     across warps in a threadblock, and finally across teams on
+  ///     the NVPTX device.
+  ///
+  /// Introduction to Decoupling
+  ///
+  /// We would like to decouple the compiler and the runtime so that the
+  /// latter is ignorant of the reduction variables (number, data types)
+  /// and the reduction operators.  This allows a simpler interface
+  /// and implementation while still attaining good performance.
+  ///
+  /// Pseudocode for the aforementioned OpenMP program generated by the
+  /// compiler is as follows:
+  ///
+  /// 1. Create private copies of reduction variables on each OpenMP
+  ///    thread: 'foo_private', 'bar_private'
+  /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
+  ///    to it and writes the result in 'foo_private' and 'bar_private'
+  ///    respectively.
+  /// 3. Call the OpenMP runtime on the GPU to reduce within a team
+  ///    and store the result on the team master:
+  ///
+  ///     __kmpc_nvptx_parallel_reduce_nowait_v2(...,
+  ///        reduceData, shuffleReduceFn, interWarpCpyFn)
+  ///
+  ///     where:
+  ///       struct ReduceData {
+  ///         double *foo;
+  ///         double *bar;
+  ///       } reduceData
+  ///       reduceData.foo = &foo_private
+  ///       reduceData.bar = &bar_private
+  ///
+  ///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
+  ///     auxiliary functions generated by the compiler that operate on
+  ///     variables of type 'ReduceData'.  They aid the runtime perform
+  ///     algorithmic steps in a data agnostic manner.
+  ///
+  ///     'shuffleReduceFn' is a pointer to a function that reduces data
+  ///     of type 'ReduceData' across two OpenMP threads (lanes) in the
+  ///     same warp.  It takes the following arguments as input:
+  ///
+  ///     a. variable of type 'ReduceData' on the calling lane,
+  ///     b. its lane_id,
+  ///     c. an offset relative to the current lane_id to generate a
+  ///        remote_lane_id.  The remote lane contains the second
+  ///        variable of type 'ReduceData' that is to be reduced.
+  ///     d. an algorithm version parameter determining which reduction
+  ///        algorithm to use.
+  ///
+  ///     'shuffleReduceFn' retrieves data from the remote lane using
+  ///     efficient GPU shuffle intrinsics and reduces, using the
+  ///     algorithm specified by the 4th parameter, the two operands
+  ///     element-wise.  The result is written to the first operand.
+  ///
+  ///     Different reduction algorithms are implemented in different
+  ///     runtime functions, all calling 'shuffleReduceFn' to perform
+  ///     the essential reduction step.  Therefore, based on the 4th
+  ///     parameter, this function behaves slightly differently to
+  ///     cooperate with the runtime to ensure correctness under
+  ///     different circumstances.
+  ///
+  ///     'InterWarpCpyFn' is a pointer to a function that transfers
+  ///     reduced variables across warps.  It tunnels, through CUDA
+  ///     shared memory, the thread-private data of type 'ReduceData'
+  ///     from lane 0 of each warp to a lane in the first warp.
+  /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
+  ///    The last team writes the global reduced value to memory.
+  ///
+  ///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
+  ///             reduceData, shuffleReduceFn, interWarpCpyFn,
+  ///             scratchpadCopyFn, loadAndReduceFn)
+  ///
+  ///     'scratchpadCopyFn' is a helper that stores reduced
+  ///     data from the team master to a scratchpad array in
+  ///     global memory.
+  ///
+  ///     'loadAndReduceFn' is a helper that loads data from
+  ///     the scratchpad array and reduces it with the input
+  ///     operand.
+  ///
+  ///     These compiler generated functions hide address
+  ///     calculation and alignment information from the runtime.
+  /// 5. if ret == 1:
+  ///     The team master of the last team stores the reduced
+  ///     result to the globals in memory.
+  ///     foo += reduceData.foo; bar *= reduceData.bar
+  ///
+  ///
+  /// Warp Reduction Algorithms
+  ///
+  /// On the warp level, we have three algorithms implemented in the
+  /// OpenMP runtime depending on the number of active lanes:
+  ///
+  /// Full Warp Reduction
+  ///
+  /// The reduce algorithm within a warp where all lanes are active
+  /// is implemented in the runtime as follows:
+  ///
+  /// full_warp_reduce(void *reduce_data,
+  ///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  ///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+  ///     ShuffleReduceFn(reduce_data, 0, offset, 0);
+  /// }
+  ///
+  /// The algorithm completes in log(2, WARPSIZE) steps.
+  ///
+  /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
+  /// not used therefore we save instructions by not retrieving lane_id
+  /// from the corresponding special registers.  The 4th parameter, which
+  /// represents the version of the algorithm being used, is set to 0 to
+  /// signify full warp reduction.
+  ///
+  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+  ///
+  /// #reduce_elem refers to an element in the local lane's data structure
+  /// #remote_elem is retrieved from a remote lane
+  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+  /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
+  ///
+  /// Contiguous Partial Warp Reduction
+  ///
+  /// This reduce algorithm is used within a warp where only the first
+  /// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
+  /// number of OpenMP threads in a parallel region is not a multiple of
+  /// WARPSIZE.  The algorithm is implemented in the runtime as follows:
+  ///
+  /// void
+  /// contiguous_partial_reduce(void *reduce_data,
+  ///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
+  ///                           int size, int lane_id) {
+  ///   int curr_size;
+  ///   int offset;
+  ///   curr_size = size;
+  ///   mask = curr_size/2;
+  ///   while (offset>0) {
+  ///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+  ///     curr_size = (curr_size+1)/2;
+  ///     offset = curr_size/2;
+  ///   }
+  /// }
+  ///
+  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+  ///
+  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+  /// if (lane_id < offset)
+  ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
+  /// else
+  ///     reduce_elem = remote_elem
+  ///
+  /// This algorithm assumes that the data to be reduced are located in a
+  /// contiguous subset of lanes starting from the first.  When there is
+  /// an odd number of active lanes, the data in the last lane is not
+  /// aggregated with any other lane's dat but is instead copied over.
+  ///
+  /// Dispersed Partial Warp Reduction
+  ///
+  /// This algorithm is used within a warp when any discontiguous subset of
+  /// lanes are active.  It is used to implement the reduction operation
+  /// across lanes in an OpenMP simd region or in a nested parallel region.
+  ///
+  /// void
+  /// dispersed_partial_reduce(void *reduce_data,
+  ///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  ///   int size, remote_id;
+  ///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
+  ///   do {
+  ///       remote_id = next_active_lane_id_right_after_me();
+  ///       # the above function returns 0 of no active lane
+  ///       # is present right after the current lane.
+  ///       size = number_of_active_lanes_in_this_warp();
+  ///       logical_lane_id /= 2;
+  ///       ShuffleReduceFn(reduce_data, logical_lane_id,
+  ///                       remote_id-1-threadIdx.x, 2);
+  ///   } while (logical_lane_id % 2 == 0 && size > 1);
+  /// }
+  ///
+  /// There is no assumption made about the initial state of the reduction.
+  /// Any number of lanes (>=1) could be active at any position.  The reduction
+  /// result is returned in the first active lane.
+  ///
+  /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
+  ///
+  /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
+  /// if (lane_id % 2 == 0 && offset > 0)
+  ///     reduce_elem = reduce_elem REDUCE_OP remote_elem
+  /// else
+  ///     reduce_elem = remote_elem
+  ///
+  ///
+  /// Intra-Team Reduction
+  ///
+  /// This function, as implemented in the runtime call
+  /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
+  /// threads in a team.  It first reduces within a warp using the
+  /// aforementioned algorithms.  We then proceed to gather all such
+  /// reduced values at the first warp.
+  ///
+  /// The runtime makes use of the function 'InterWarpCpyFn', which copies
+  /// data from each of the "warp master" (zeroth lane of each warp, where
+  /// warp-reduced data is held) to the zeroth warp.  This step reduces (in
+  /// a mathematical sense) the problem of reduction across warp masters in
+  /// a block to the problem of warp reduction.
+  ///
+  ///
+  /// Inter-Team Reduction
+  ///
+  /// Once a team has reduced its data to a single value, it is stored in
+  /// a global scratchpad array.  Since each team has a distinct slot, this
+  /// can be done without locking.
+  ///
+  /// The last team to write to the scratchpad array proceeds to reduce the
+  /// scratchpad array.  One or more workers in the last team use the helper
+  /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
+  /// the k'th worker reduces every k'th element.
+  ///
+  /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
+  /// reduce across workers and compute a globally reduced value.
+  ///
   /// \param Loc                The location where the reduction was
   ///                           encountered. Must be within the associate
   ///                           directive and after the last local access to the
   ///                           reduction variables.
   /// \param AllocaIP           An insertion point suitable for allocas usable
   ///                           in reductions.
+  /// \param AllocaIP           An insertion point suitable for code generation.
   /// \param ReductionInfos     A list of info on each reduction variable.
-  /// \param IsNoWait           A flag set if the reduction is marked as nowait.
-  InsertPointTy createReductionsGPU(const LocationDescription &Loc,
-                                    InsertPointTy AllocaIP,
-                                    ArrayRef<ReductionInfo> ReductionInfos,
-                                    bool IsNoWait = false,
-                                    bool IsTeamsReduction = false,
-                                    bool HasDistribute = false);
+  /// \param IsNoWait           Optional flag set if the reduction is marked as
+  ///                           nowait.
+  /// \param IsTeamsReduction   Optional flag set if it is a teams
+  ///                           reduction.
+  /// \param HasDistribute      Optional flag set if it is a
+  ///                           distribute reduction.
+  /// \param GridValue          Optional GPU grid value.
+  /// \param GridValue          Optional GPU grid value.
+  /// \param ReductionBufNum    Optional OpenMPCUDAReductionBufNumValue to be
+  /// used for teams reduction.
+  InsertPointTy createReductionsGPU(
+      const LocationDescription &Loc, InsertPointTy AllocaIP,
+      InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+      bool IsNoWait = false, bool IsTeamsReduction = false,
+      bool HasDistribute = false,
+      ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
+      std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
+      Value *SrcLocInfo = nullptr);
 
   // TODO: provide atomic and non-atomic reduction generators for reduction
   // operators defined by the OpenMP specification.
@@ -1433,9 +1962,6 @@ class OpenMPIRBuilder {
                                  bool IsNoWait = false,
                                  bool IsTeamsReduction = false,
                                  bool HasDistribute = false);
-
-  ///}
-
   /// Return the insertion point used by the underlying IRBuilder.
   InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
 
@@ -1509,19 +2035,6 @@ class OpenMPIRBuilder {
                                  Value *NumThreads, Value *HostPtr,
                                  ArrayRef<Value *> KernelArgs);
 
-  /// Generate a barrier runtime call.
-  ///
-  /// \param Loc The location at which the request originated and is fulfilled.
-  /// \param DK The directive which caused the barrier
-  /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
-  /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
-  ///                        should be checked and acted upon.
-  ///
-  /// \returns The insertion point after the barrier.
-  InsertPointTy emitBarrierImpl(const LocationDescription &Loc,
-                                omp::Directive DK, bool ForceSimpleCall,
-                                bool CheckCancelFlag);
-
   /// Generate a flush runtime call.
   ///
   /// \param Loc The location at which the request originated and is fulfilled.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index b220aab6700e20..e3054ab2e5d064 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -147,15 +147,6 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
 
 Function *GLOBAL_ReductionFunc = nullptr;
 
-static uint64_t getTypeSizeInBytes(Module &M, Type *Type) {
-  return divideCeil(M.getDataLayout().getTypeSizeInBits(Type), 8);
-}
-
-static Value *getTypeSizeInBytesValue(IRBuilder<> &Builder, Module &M,
-                                      Type *Type) {
-  return Builder.getInt64(getTypeSizeInBytes(M, Type));
-}
-
 static const omp::GV &getGridValue(const Triple &T, StringRef Features) {
   if (T.isAMDGPU()) {
     if (Features.count("+wavefrontsize64"))
@@ -767,6 +758,12 @@ void OpenMPIRBuilder::finalize(Function *Fn) {
 
   if (!OffloadInfoManager.empty())
     createOffloadEntriesAndInfoMetadata(ErrorReportFn);
+
+  if (Config.EmitLLVMUsed) {
+    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
+        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
+    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
+  }
 }
 
 OpenMPIRBuilder::~OpenMPIRBuilder() {
@@ -899,16 +896,12 @@ Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
 }
 
 OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
-                               bool ForceSimpleCall, bool CheckCancelFlag) {
+OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
+                               bool ForceSimpleCall, bool CheckCancelFlag,
+                               Value *ThreadID) {
   if (!updateToLocation(Loc))
     return Loc.IP;
-  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
-}
 
-OpenMPIRBuilder::InsertPointTy
-OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
-                                 bool ForceSimpleCall, bool CheckCancelFlag) {
   // Build call __kmpc_cancel_barrier(loc, thread_id) or
   //            __kmpc_barrier(loc, thread_id);
 
@@ -933,9 +926,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  Value *Args[] = {
-      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
-      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
+  if (!ThreadID)
+    ThreadID = getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
+
+  Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+                   ThreadID};
 
   // If we are in a cancellable parallel region, barriers are cancellation
   // points.
@@ -2071,219 +2066,273 @@ OpenMPIRBuilder::createSection(const LocationDescription &Loc,
                               /*IsCancellable*/ true);
 }
 
-static Value *getGPUWarpSize(Module &M, OpenMPIRBuilder &OMPBuilder) {
-  return OMPBuilder.Builder.CreateCall(
-      OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size),
-      {});
+static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
+  BasicBlock::iterator IT(I);
+  IT++;
+  return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
 }
 
-static Value *getGPUThreadID(Module &M, OpenMPIRBuilder &OMPBuilder) {
-  return OMPBuilder.Builder.CreateCall(
-      OMPBuilder.getOrCreateRuntimeFunction(
-          M, OMPRTL___kmpc_get_hardware_thread_id_in_block),
-      {});
+void OpenMPIRBuilder::emitUsed(StringRef Name,
+                               std::vector<WeakTrackingVH> &List) {
+  if (List.empty())
+    return;
+
+  // Convert List to what ConstantArray needs.
+  SmallVector<Constant *, 8> UsedArray;
+  UsedArray.resize(List.size());
+  for (unsigned I = 0, E = List.size(); I != E; ++I)
+    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+        cast<Constant>(&*List[I]), Builder.getPtrTy());
+
+  if (UsedArray.empty())
+    return;
+  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
+
+  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+                                ConstantArray::get(ATy, UsedArray), Name);
+
+  GV->setSection("llvm.metadata");
 }
 
-static Value *getGPUNumThreads(Module &M, OpenMPIRBuilder &OMPBuilder) {
-  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
-  llvm::Function *F = M.getFunction(LocSize);
-  if (!F) {
-    LLVMContext &Ctx = M.getContext();
-    Type *I32Type = Type::getInt32Ty(Ctx);
+Value *OpenMPIRBuilder::getGPUThreadID() {
+  return Builder.CreateCall(
+      getOrCreateRuntimeFunction(M,
+                                 OMPRTL___kmpc_get_hardware_thread_id_in_block),
+      {});
+}
 
-    F = Function::Create(FunctionType::get(I32Type, std::nullopt, false),
-                         GlobalVariable::ExternalLinkage, LocSize, M);
-  }
-  return OMPBuilder.Builder.CreateCall(F, std::nullopt, "nvptx_num_threads");
+Value *OpenMPIRBuilder::getGPUWarpSize() {
+  return Builder.CreateCall(
+      getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
 }
 
-static Value *getNVPTXWarpID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
-  unsigned LaneIDBits =
-      llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
-  return OMPIRBuilder.Builder.CreateAShr(getGPUThreadID(M, OMPIRBuilder),
-                                         LaneIDBits, "nvptx_warp_id");
+Value *OpenMPIRBuilder::getNVPTXWarpID() {
+  unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
+  return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
 }
 
-static Value *getNVPTXLaneID(Module &M, OpenMPIRBuilder &OMPIRBuilder) {
-   unsigned LaneIDBits =
-     llvm::Log2_32(OMPIRBuilder.Config.getGridValue().GV_Warp_Size);
+Value *OpenMPIRBuilder::getNVPTXLaneID() {
+  unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
   assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
   unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
-  return OMPIRBuilder.Builder.CreateAnd(
-      getGPUThreadID(M, OMPIRBuilder),
-      OMPIRBuilder.Builder.getInt32(LaneIDMask), "nvptx_lane_id");
+  return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
+                           "nvptx_lane_id");
 }
 
-namespace {
-enum CopyAction : unsigned {
-  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
-  // the warp using shuffle instructions.
-  RemoteLaneToThread,
-  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
-  ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
-  llvm::Value *RemoteLaneOffset;
-  llvm::Value *ScratchpadIndex;
-  llvm::Value *ScratchpadWidth;
-};
-
-static Value *castValueToType(Module &M, OpenMPIRBuilder &OMPBuilder,
-                              Value *From, Type *ToType,
-                              OpenMPIRBuilder::InsertPointTy AllocaIP,
-                              const OpenMPIRBuilder::LocationDescription &Loc) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
+Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
+                                        Type *ToType) {
   Type *FromType = From->getType();
-  uint64_t FromSize =
-      divideCeil(M.getDataLayout().getTypeSizeInBits(FromType), 8);
-  uint64_t ToSize = divideCeil(M.getDataLayout().getTypeSizeInBits(ToType), 8);
+  uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
+  uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
   assert(FromSize > 0 && "From size must be greater than zero");
-  assert(ToSize > 0 && "From size must be greater than zero");
+  assert(ToSize > 0 && "To size must be greater than zero");
   if (FromType == ToType)
     return From;
   if (FromSize == ToSize)
     return Builder.CreateBitCast(From, ToType);
   if (ToType->isIntegerTy() && FromType->isIntegerTy())
-    // FIXME(JAN): Assuming signed integer here, not sure how to find out
-    // if unsigned
     return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
-  OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+  InsertPointTy SaveIP = Builder.saveIP();
   Builder.restoreIP(AllocaIP);
-  Value *CastItem = Builder.CreateAlloca(ToType, nullptr, "cast_tmp");
-  Builder.restoreIP(CurIP);
+  Value *CastItem = Builder.CreateAlloca(ToType);
+  Builder.restoreIP(SaveIP);
 
   Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      CastItem, FromType->getPointerTo(), "valcastitem");
+      CastItem, FromType->getPointerTo());
   Builder.CreateStore(From, ValCastItem);
-  return Builder.CreateLoad(ToType, CastItem, "castitemload");
+  return Builder.CreateLoad(ToType, CastItem);
 }
 
-static Value *
-createRuntimeShuffleFunction(Module &M, OpenMPIRBuilder &OMPBuilder,
-                             const OpenMPIRBuilder::LocationDescription &Loc,
-                             OpenMPIRBuilder::InsertPointTy AllocaIP,
-                             Value *Element, Type *ElementType, Value *Offset) {
-  LLVMContext &Ctx = M.getContext();
-  IRBuilder<> &Builder = OMPBuilder.Builder;
-  uint64_t Size =
-      divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
+Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
+                                                     Value *Element,
+                                                     Type *ElementType,
+                                                     Value *Offset) {
+  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
   assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
-  Function *ShuffleFunc = OMPBuilder.getOrCreateRuntimeFunctionPtr(
+
+  // Cast all types to 32- or 64-bit values before calling shuffle routines.
+  Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
+  Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
+  Value *WarpSize =
+      Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
+  Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
       Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
                 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
-  Type *IntType = Builder.getIntNTy(Size <= 4 ? 32 : 64);
-  Value *ElemCast = Builder.CreateCast(Instruction::SExt, Element, IntType);
-  Value *WarpSize = getGPUWarpSize(M, OMPBuilder);
   Value *WarpSizeCast =
-      Builder.CreateIntCast(WarpSize, Type::getInt16Ty(Ctx), /*isSigned=*/true);
+      Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
   Value *ShuffleCall =
       Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
-  return castValueToType(M, OMPBuilder, ShuffleCall, IntType, AllocaIP, Loc);
-}
-
-static void shuffleAndStore(Value *SrcAddr, Value *DstAddr, Type *ElementType,
-                            llvm::Value *Offset, Type *ReductionArrayTy,
-                            const OpenMPIRBuilder::LocationDescription &Loc,
-                            Module &M, OpenMPIRBuilder &OMPBuilder,
-                            OpenMPIRBuilder::InsertPointTy AllocaIP) {
-  LLVMContext &Ctx = M.getContext();
-  IRBuilder<> &Builder = OMPBuilder.Builder;
-  uint64_t Size =
-      divideCeil(M.getDataLayout().getTypeSizeInBits(ElementType), 8);
-  Type *PtrTy = PointerType::getUnqual(Ctx);
+  return castValueToType(AllocaIP, ShuffleCall, CastTy);
+}
+
+void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
+                                      Value *DstAddr, Type *ElemType,
+                                      Value *Offset, Type *ReductionArrayTy) {
+  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
+  // Create the loop over the big sized data.
+  // ptr = (void*)Elem;
+  // ptrEnd = (void*) Elem + 1;
+  // Step = 8;
+  // while (ptr + Step < ptrEnd)
+  //   shuffle((int64_t)*ptr);
+  // Step = 4;
+  // while (ptr + Step < ptrEnd)
+  //   shuffle((int32_t)*ptr);
+  // ...
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
   Value *ElemPtr = DstAddr;
   Value *Ptr = SrcAddr;
-  //  Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
-  // Builder.CreateConstGEP1_64(ReductionArrayTy, SrcAddr, 1), PtrTy);
-  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
+  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
     if (Size < IntSize)
       continue;
-    // FIXME(JAN): Check if there is a function to convert from bytes to bits
-    Type *IntTy = Builder.getIntNTy(IntSize * 8);
+    Type *IntType = Builder.getIntNTy(IntSize * 8);
     Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-        Ptr, IntTy->getPointerTo(), "ptrcast");
+        Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
+    Value *SrcAddrGEP =
+        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
     ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-        ElemPtr, IntTy->getPointerTo(), "elemptrcast");
-
-    // FIXME(JAN): Implement loop to handle larger size
-    assert(((Size / IntSize) <= 1) && "Unsupported IntSize");
-    Value *Val = Builder.CreateLoad(IntTy, Ptr);
-    Value *Res = createRuntimeShuffleFunction(M, OMPBuilder, Loc, AllocaIP, Val,
-                                              IntTy, Offset);
-    Builder.CreateStore(Res, ElemPtr);
-    Ptr = Builder.CreateConstGEP1_64(ReductionArrayTy, Ptr, 1, "ptrgep");
-    ElemPtr =
-        Builder.CreateConstGEP1_64(ReductionArrayTy, ElemPtr, 1, "elemptrgep");
+        ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
+
+    Function *CurFunc = Builder.GetInsertBlock()->getParent();
+    if ((Size / IntSize) > 1) {
+      Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
+          SrcAddrGEP, Builder.getPtrTy());
+      BasicBlock *PreCondBB =
+          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
+      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
+      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
+      BasicBlock *CurrentBB = Builder.GetInsertBlock();
+      emitBlock(PreCondBB, CurFunc);
+      PHINode *PhiSrc =
+          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
+      PhiSrc->addIncoming(Ptr, CurrentBB);
+      PHINode *PhiDest =
+          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
+      PhiDest->addIncoming(ElemPtr, CurrentBB);
+      Ptr = PhiSrc;
+      ElemPtr = PhiDest;
+      Value *PtrDiff = Builder.CreatePtrDiff(
+          Builder.getInt8Ty(), PtrEnd,
+          Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
+      Builder.CreateCondBr(
+          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
+          ExitBB);
+      emitBlock(ThenBB, CurFunc);
+      Value *Res = createRuntimeShuffleFunction(
+          AllocaIP,
+          Builder.CreateAlignedLoad(
+              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
+          IntType, Offset);
+      Builder.CreateAlignedStore(Res, ElemPtr,
+                                 M.getDataLayout().getPrefTypeAlign(ElemType));
+      Value *LocalPtr =
+          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+      Value *LocalElemPtr =
+          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+      PhiSrc->addIncoming(LocalPtr, ThenBB);
+      PhiDest->addIncoming(LocalElemPtr, ThenBB);
+      emitBranch(PreCondBB);
+      emitBlock(ExitBB, CurFunc);
+    } else {
+      Value *Res = createRuntimeShuffleFunction(
+          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
+      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
+                                         Res->getType()->getScalarSizeInBits())
+        Res = Builder.CreateTrunc(Res, ElemType);
+      Builder.CreateStore(Res, ElemPtr);
+      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
+      ElemPtr =
+          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
+    }
     Size = Size % IntSize;
   }
 }
 
-static void
-emitReductionListCopy(CopyAction Action, Type *ReductionArrayTy,
-                      ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
-                      Value *SrcBase, Value *DestBase, Module &M,
-                      OpenMPIRBuilder &OMPBuilder,
-                      const OpenMPIRBuilder::LocationDescription &Loc,
-                      OpenMPIRBuilder::InsertPointTy AllocaIP,
-                      CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-  LLVMContext &Ctx = M.getContext();
-  IRBuilder<> &Builder = OMPBuilder.Builder;
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-
+void OpenMPIRBuilder::emitReductionListCopy(
+    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
+    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
+    CopyOptionsTy CopyOptions) {
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
   Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
 
+  // Iterates, element-by-element, through the source Reduce list and
+  // make a copy.
   for (auto En : enumerate(ReductionInfos)) {
-    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    const ReductionInfo &RI = En.value();
     Value *SrcElementAddr = nullptr;
     Value *DestElementAddr = nullptr;
     Value *DestElementPtrAddr = nullptr;
+    // Should we shuffle in an element from a remote lane?
     bool ShuffleInElement = false;
+    // Set to true to update the pointer in the dest Reduce list to a
+    // newly created element.
     bool UpdateDestListPtr = false;
 
     // Step 1.1: Get the address for the src element in the Reduce list.
-    Value *SrcElementPtrAddr = Builder.CreateConstGEP2_64(
-        ReductionArrayTy, SrcBase, 0, En.index(), "srcelementptraddr");
-    SrcElementAddr =
-        Builder.CreateLoad(PtrTy, SrcElementPtrAddr, "srcelementaddr");
+    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
+        ReductionArrayTy, SrcBase,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
 
     // Step 1.2: Create a temporary to store the element in the destination
     // Reduce list.
     DestElementPtrAddr = Builder.CreateInBoundsGEP(
         ReductionArrayTy, DestBase,
-        {Builder.getInt64(0), Builder.getInt64(En.index())},
-        "destelementptraddr");
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
     switch (Action) {
-    case RemoteLaneToThread: {
-      OpenMPIRBuilder::InsertPointTy CurIP = Builder.saveIP();
+    case CopyAction::RemoteLaneToThread: {
+      InsertPointTy CurIP = Builder.saveIP();
       Builder.restoreIP(AllocaIP);
-      DestElementAddr = Builder.CreateAlloca(RI.ElementType, nullptr,
-                                             ".omp.reduction.element");
+      AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
+                                                    ".omp.reduction.element");
+      DestAlloca->setAlignment(
+          M.getDataLayout().getPrefTypeAlign(RI.ElementType));
+      DestElementAddr = DestAlloca;
+      DestElementAddr =
+          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
+                                      DestElementAddr->getName() + ".ascast");
       Builder.restoreIP(CurIP);
       ShuffleInElement = true;
       UpdateDestListPtr = true;
       break;
     }
-    case ThreadCopy: {
+    case CopyAction::ThreadCopy: {
       DestElementAddr =
-          Builder.CreateLoad(PtrTy, DestElementPtrAddr, "destelementaddr");
+          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
       break;
     }
     }
 
-    // FIXME(JAN): Original code in clanguses <Addr>.withElementType(...)
-    // check if this generates any code
-
+    // Now that all active lanes have read the element in the
+    // Reduce list, shuffle over the value from the remote lane.
     if (ShuffleInElement) {
-      shuffleAndStore(SrcElementAddr, DestElementAddr, RI.ElementType,
-                      RemoteLaneOffset, ReductionArrayTy, Loc, M, OMPBuilder,
-                      AllocaIP);
+      shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
+                      RemoteLaneOffset, ReductionArrayTy);
     } else {
-      // FIXME(JAN): Assume Scalar here (TEK_Scalar in Clang)
-      Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
-      Builder.CreateStore(Elem, DestElementAddr);
+      switch (RI.EvaluationKind) {
+      case EvaluationKindTy::Scalar: {
+        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
+        // Store the source element value to the dest element address.
+        Builder.CreateStore(Elem, DestElementAddr);
+        break;
+      }
+      case EvaluationKindTy::Complex: {
+        break;
+      }
+      case EvaluationKindTy::Aggregate: {
+        Value *SizeVal = Builder.getInt64(
+            M.getDataLayout().getTypeStoreSize(RI.ElementType));
+        Builder.CreateMemCpy(
+            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+            SizeVal, false);
+        break;
+      }
+      };
     }
+
     // Step 3.1: Modify reference in dest Reduce list as needed.
     // Modifying the reference in Reduce list to point to the newly
     // created element.  The element is live in the current function
@@ -2291,443 +2340,500 @@ emitReductionListCopy(CopyAction Action, Type *ReductionArrayTy,
     // RemoteReduceData[i] = (void*)&RemoteElem
     if (UpdateDestListPtr) {
       Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-          DestElementAddr, PtrTy, "castdestaddr");
+          DestElementAddr, Builder.getPtrTy(),
+          DestElementAddr->getName() + ".ascast");
       Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
     }
   }
 }
 
-static OpenMPIRBuilder::InsertPointTy getIPAfterInstr(Instruction *I) {
-  BasicBlock::iterator it(I);
-  it++;
-  return OpenMPIRBuilder::InsertPointTy(I->getParent(), it);
-}
-
-static Function *emitShuffleAndReduceFunction(
-    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
-    OpenMPIRBuilder &OMPBuilder) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
-
-  LLVMContext &Ctx = M.getContext();
-  Type *VoidTy = Type::getVoidTy(Ctx);
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-  Type *I16Type = Type::getInt16Ty(Ctx);
-  auto FuncTy = FunctionType::get(VoidTy, {PtrTy, I16Type, I16Type, I16Type},
-                                  /* IsVarArg */ false);
-  Function *SarFunc =
-      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                       "_omp_reduction_shuffle_and_reduce_func", &M);
-  SarFunc->setDoesNotRecurse();
-
-  // Set arg names
-  Argument *Arg0 = SarFunc->getArg(0);
-  Argument *Arg1 = SarFunc->getArg(1);
-  Argument *Arg2 = SarFunc->getArg(2);
-  Argument *Arg3 = SarFunc->getArg(3);
-  Arg0->setName("reduce_list_arg");
-  Arg1->setName("lane_id_arg");
-  Arg2->setName("remote_lane_offset_arg");
-  Arg3->setName("algo_ver_arg");
-
-  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", SarFunc);
-  Builder.SetInsertPoint(EntryBlock);
-
-  Type *Arg0Type = Arg0->getType();
-  Type *ArgNType = Arg1->getType();
-  Type *ArgNPtrType = Arg1->getType()->getPointerTo();
-  Value *ReduceListAlloca =
-      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
-  Value *LaneIdAlloca =
-      Builder.CreateAlloca(ArgNType, nullptr, Arg1->getName() + ".addr");
-  Value *RemoteLaneOffsetAlloca =
-      Builder.CreateAlloca(ArgNType, nullptr, Arg2->getName() + ".addr");
-  Value *AlgoVerAlloca =
-      Builder.CreateAlloca(ArgNType, nullptr, Arg3->getName() + ".addr");
-  // FIXME(Jan): Compute reduction list array type
-  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
-  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
-      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
-
-  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
-  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      LaneIdAlloca, ArgNPtrType, LaneIdAlloca->getName() + ".acast");
-  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      RemoteLaneOffsetAlloca, ArgNPtrType,
-      RemoteLaneOffsetAlloca->getName() + ".acast");
-  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      AlgoVerAlloca, ArgNPtrType, AlgoVerAlloca->getName() + ".acast");
-  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      RemoteReductionListAlloca, PtrTy,
-      RemoteReductionListAlloca->getName() + ".acast");
-
-  Builder.CreateStore(Arg0, ReduceListAddrCast);
-  Builder.CreateStore(Arg1, LaneIdAddrCast);
-  Builder.CreateStore(Arg2, RemoteLaneOffsetAddrCast);
-  Builder.CreateStore(Arg3, AlgoVerAddrCast);
-
-  Value *ReduceList =
-      Builder.CreateLoad(Arg0Type, ReduceListAddrCast, "reduce_list");
-  Value *LaneId = Builder.CreateLoad(ArgNType, LaneIdAddrCast, "lane_id");
-  Value *RemoteLaneOffset = Builder.CreateLoad(
-      ArgNType, RemoteLaneOffsetAddrCast, "remote_lane_offset");
-  Value *AlgoVer = Builder.CreateLoad(ArgNType, AlgoVerAddrCast, "algo_ver");
-
-  OpenMPIRBuilder::InsertPointTy AllocaIP =
-      getIPAfterInstr(RemoteReductionListAlloca);
-  emitReductionListCopy(RemoteLaneToThread, RedListArrayTy, ReductionInfos,
-                        ReduceList, RemoteListAddrCast, M, OMPBuilder, Loc,
-                        AllocaIP, {RemoteLaneOffset, nullptr, nullptr});
-
-  // The actions to be performed on the Remote Reduce list is dependent
-  // on the algorithm version.
-  //
-  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
-  //  LaneId % 2 == 0 && Offset > 0):
-  //    do the reduction value aggregation
-  //
-  //  The thread local variable Reduce list is mutated in place to host the
-  //  reduced data, which is the aggregated value produced from local and
-  //  remote lanes.
-  //
-  //  Note that AlgoVer is expected to be a constant integer known at compile
-  //  time.
-  //  When AlgoVer==0, the first conjunction evaluates to true, making
-  //    the entire predicate true during compile time.
-  //  When AlgoVer==1, the second conjunction has only the second part to be
-  //    evaluated during runtime.  Other conjunctions evaluates to false
-  //    during compile time.
-  //  When AlgoVer==2, the third conjunction has only the second part to be
-  //    evaluated during runtime.  Other conjunctions evaluates to false
-  //    during compile time.
-  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
-  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
-  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
-  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
-  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
-  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
-  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
-  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
-  Value *RemoteOffsetComp =
-      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
-  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
-  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
-  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
-
-  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", SarFunc);
-  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", SarFunc);
-  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", SarFunc);
-
-  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
-  Builder.SetInsertPoint(ThenBB);
-  // reduce_function(LocalReduceList, RemoteReduceList)
-  Value *LocalReduceListPtr =
-      Builder.CreatePointerBitCastOrAddrSpaceCast(ReduceList, PtrTy);
-  Value *RemoteReduceListPtr =
-      Builder.CreatePointerBitCastOrAddrSpaceCast(RemoteListAddrCast, PtrTy);
-  Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
-  Builder.CreateBr(MergeBB);
-  Builder.SetInsertPoint(ElseBB);
-  Builder.CreateBr(MergeBB);
-  Builder.SetInsertPoint(MergeBB);
-
-  Value *Algo1_2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
-  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
-  Value *CondCopy = Builder.CreateAnd(Algo1_2, LaneIdGtOffset);
-
-  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "cpy_then", SarFunc);
-  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "cpy_else", SarFunc);
-  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "cpy_ifcont", SarFunc);
-
-  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
-
-  Builder.SetInsertPoint(CpyThenBB);
-  emitReductionListCopy(ThreadCopy, RedListArrayTy, ReductionInfos,
-                        RemoteListAddrCast, ReduceList, M, OMPBuilder, Loc,
-                        AllocaIP);
-  Builder.CreateBr(CpyMergeBB);
-  Builder.SetInsertPoint(CpyElseBB);
-  Builder.CreateBr(CpyMergeBB);
-  Builder.SetInsertPoint(CpyMergeBB);
-  Builder.CreateRetVoid();
-
-  return SarFunc;
-}
-
-static Function *emitInterWarpCopyFunction(
-    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
-    OpenMPIRBuilder &OMPBuilder) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
-  OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
+Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
+    const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
+    AttributeList FuncAttrs) {
+  InsertPointTy SavedIP = Builder.saveIP();
   LLVMContext &Ctx = M.getContext();
-  Type *VoidTy = Type::getVoidTy(Ctx);
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-  Type *I32Type = Type::getInt32Ty(Ctx);
-  auto FuncTy =
-      FunctionType::get(VoidTy, {PtrTy, I32Type}, /* IsVarArg */ false);
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
+      /* IsVarArg */ false);
   Function *WcFunc =
       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                        "_omp_reduction_inter_warp_copy_func", &M);
-  WcFunc->setDoesNotRecurse();
-
-  // Set arg names
-  Argument *Arg0 = WcFunc->getArg(0);
-  Argument *Arg1 = WcFunc->getArg(1);
-  Arg0->setName("reduce_list");
-  Arg1->setName("num_warps");
-
-  // Ensure data transfer storage
-  unsigned WarpSize = OMPBuilder.Config.getGridValue().GV_Warp_Size;
-  // FIXME(Jan): Not sure about the array type here, but it is I32 in Clang
-  auto *ArrayTy = ArrayType::get(I32Type, WarpSize);
+  WcFunc->setAttributes(FuncAttrs);
+  WcFunc->addParamAttr(0, Attribute::NoUndef);
+  WcFunc->addParamAttr(1, Attribute::NoUndef);
+  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
+  Builder.SetInsertPoint(EntryBB);
+
+  // ReduceList: thread local Reduce list.
+  // At the stage of the computation when this function is called, partially
+  // aggregated values reside in the first lane of every active warp.
+  Argument *ReduceListArg = WcFunc->getArg(0);
+  // NumWarps: number of warps active in the parallel region.  This could
+  // be smaller than 32 (max warps in a CTA) for partial block reduction.
+  Argument *NumWarpsArg = WcFunc->getArg(1);
+
+  // This array is used as a medium to transfer, one reduce element at a time,
+  // the data from the first lane of every warp to lanes in the first warp
+  // in order to perform the final step of a reduction in a parallel region
+  // (reduction across warps).  The array is placed in NVPTX __shared__ memory
+  // for reduced latency, as well as to have a distinct copy for concurrently
+  // executing target regions.  The array is declared with common linkage so
+  // as to be shared across compilation units.
   StringRef TransferMediumName =
       "__openmp_nvptx_data_transfer_temporary_storage";
   GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
+  unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
+  ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
   if (!TransferMedium) {
-    unsigned SharedAddressSpace =
-        3; /* FIXME(Jan): C.getTargetAddressSpace(LangAS::cuda_shared); */
     TransferMedium = new GlobalVariable(
         M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
         UndefValue::get(ArrayTy), TransferMediumName,
         /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
+        /*AddressSpace=*/3);
   }
 
-  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", WcFunc);
-  Builder.SetInsertPoint(EntryBlock);
+  uint32_t SrcLocStrSize;
+  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 
-  Type *Arg0Type = Arg0->getType();
-  Type *Arg1Type = Arg1->getType();
-  Value *ReduceListAlloca =
-      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
-  Instruction *NumWarpsAlloca =
-      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+  // Get the CUDA thread id of the current OpenMP thread on the GPU.
+  Value *GPUThreadID = getGPUThreadID();
+  // nvptx_lane_id = nvptx_id % warpsize
+  Value *LaneID = getNVPTXLaneID();
+  // nvptx_warp_id = nvptx_id / warpsize
+  Value *WarpID = getNVPTXWarpID();
+
+  InsertPointTy AllocaIP =
+      InsertPointTy(Builder.GetInsertBlock(),
+                    Builder.GetInsertBlock()->getFirstInsertionPt());
+  Type *Arg0Type = ReduceListArg->getType();
+  Type *Arg1Type = NumWarpsArg->getType();
+  Builder.restoreIP(AllocaIP);
+  AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
+      Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
+  AllocaInst *NumWarpsAlloca =
+      Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
+  Value *ThreadID =
+      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
   Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".acast");
+      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
   Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
       NumWarpsAlloca, Arg1Type->getPointerTo(),
-      NumWarpsAlloca->getName() + ".acast");
-  Builder.CreateStore(Arg0, ReduceListAddrCast);
-  Builder.CreateStore(Arg1, NumWarpsAddrCast);
-
-  // Get GPU Info
-  Value *ThreadID = getGPUThreadID(M, OMPBuilder);
-  Value *LaneID = getNVPTXLaneID(M, OMPBuilder);
-  Value *WarpID = getNVPTXWarpID(M, OMPBuilder);
+      NumWarpsAlloca->getName() + ".ascast");
+  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+  Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
+  AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
+  InsertPointTy CodeGenIP =
+      getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
+  Builder.restoreIP(CodeGenIP);
 
-  Value *ReduceListArg =
-      Builder.CreateLoad(PtrTy, ReduceListAddrCast, "reduce_list_arg");
+  Value *ReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
 
   for (auto En : enumerate(ReductionInfos)) {
-    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
-    Type *ElementTy = RI.ElementType;
-    unsigned NumTypeBits = M.getDataLayout().getTypeSizeInBits(ElementTy);
-    unsigned RealTySize = divideCeil(NumTypeBits, 8);
+    //
+    // Warp master copies reduce element to transfer medium in __shared__
+    // memory.
+    //
+    const ReductionInfo &RI = En.value();
+    unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
     for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
+      Type *CType = Builder.getIntNTy(TySize * 8);
+
       unsigned NumIters = RealTySize / TySize;
       if (NumIters == 0)
         continue;
-      //      Type *CopyTy = Builder.getIntNTy(TySize);
-      Type *Int32Ty = Builder.getInt32Ty();
       Value *Cnt = nullptr;
-      Value *CntAddrAcast = nullptr;
+      Value *CntAddr = nullptr;
       BasicBlock *PrecondBB = nullptr;
       BasicBlock *ExitBB = nullptr;
-
       if (NumIters > 1) {
-        OpenMPIRBuilder::InsertPointTy CurrIP = Builder.saveIP();
-        Builder.SetInsertPoint(NumWarpsAlloca);
-        Value *CntAddr = Builder.CreateAlloca(Int32Ty, nullptr, ".cnt.addr");
-        Builder.restoreIP(CurrIP);
-        CntAddrAcast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-            CntAddr, PtrTy, CntAddr->getName() + ".acast");
-        Builder.CreateStore(Constant::getNullValue(Int32Ty), CntAddrAcast);
-        PrecondBB = BasicBlock::Create(Ctx, "precond", WcFunc);
-        ExitBB = BasicBlock::Create(Ctx, "exit", WcFunc);
-        BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body", WcFunc);
-        Builder.CreateBr(PrecondBB);
-        Builder.SetInsertPoint(PrecondBB);
-        Cnt = Builder.CreateLoad(Int32Ty, CntAddrAcast, "cnt");
-        Value *Cmp = Builder.CreateICmpULT(Cnt, Builder.getInt32(NumIters));
+        CodeGenIP = Builder.saveIP();
+        Builder.restoreIP(AllocaIP);
+        CntAddr =
+            Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
+
+        CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
+                                              CntAddr->getName() + ".ascast");
+        Builder.restoreIP(CodeGenIP);
+        Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
+                            CntAddr,
+                            /*Volatile=*/false);
+        PrecondBB = BasicBlock::Create(Ctx, "precond");
+        ExitBB = BasicBlock::Create(Ctx, "exit");
+        BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
+        emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
+        Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
+                                 /*Volatile=*/false);
+        Value *Cmp = Builder.CreateICmpULT(
+            Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
         Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
-        Builder.SetInsertPoint(BodyBB);
+        emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
       }
 
-      OMPBuilder.createBarrier(
-          OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
-          omp::Directive::OMPD_unknown,
-          /* ForceSimpleCall */ false,
-          /* CheckCancelFlag */ true);
-      BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then", WcFunc);
-      BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else", WcFunc);
-      BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont", WcFunc);
+      // kmpc_barrier.
+      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                    omp::Directive::OMPD_unknown,
+                    /* ForceSimpleCall */ false,
+                    /* CheckCancelFlag */ true, ThreadID);
+      BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+      BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+      BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
 
       // if (lane_id  == 0)
       Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
       Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
+      emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
 
-      // then
       // Reduce element = LocalReduceList[i]
-      Builder.SetInsertPoint(ThenBB);
-      // FIXME(JAN): Should array type be passed in?
-      auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
-      // FIXME(JAN): maybe it should be 0,0 and not use En.index()
-      Value *ReduceListElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
-          RedListArrayTy, ReduceListArg, 0, En.index());
-      Value *ReduceListElementPtr = Builder.CreateLoad(
-          PtrTy, ReduceListElementPtrPtr, "reduce_list_element_ptr");
+      auto *RedListArrayTy =
+          ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+      Type *IndexTy = Builder.getIndexTy(
+          M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+      Value *ElemPtrPtr =
+          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+                                    {ConstantInt::get(IndexTy, 0),
+                                     ConstantInt::get(IndexTy, En.index())});
+      // elemptr = ((CopyType*)(elemptrptr)) + I
+      Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
       if (NumIters > 1)
-        ReduceListElementPtr =
-            Builder.CreateGEP(Int32Ty, ReduceListElementPtr, Cnt);
+        ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
 
-      Value *TransferElemAddr = Builder.CreateInBoundsGEP(
+      // Get pointer to location in transfer medium.
+      // MediumPtr = &medium[warp_id]
+      Value *MediumPtr = Builder.CreateInBoundsGEP(
           ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
-      Value *ReduceListElement = Builder.CreateLoad(
-          I32Type, ReduceListElementPtr, "reduce_list_element");
-      Builder.CreateStore(ReduceListElement, TransferElemAddr,
+      // elem = *elemptr
+      //*MediumPtr = elem
+      Value *Elem = Builder.CreateLoad(CType, ElemPtr);
+      // Store the source element value to the dest element address.
+      Builder.CreateStore(Elem, MediumPtr,
                           /*IsVolatile*/ true);
       Builder.CreateBr(MergeBB);
 
       // else
-      Builder.SetInsertPoint(ElseBB);
+      emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
       Builder.CreateBr(MergeBB);
 
       // endif
-      Builder.SetInsertPoint(MergeBB);
-      OMPBuilder.createBarrier(
-          OpenMPIRBuilder::LocationDescription(Builder.saveIP(), Loc.DL),
-          omp::Directive::OMPD_unknown,
-          /* ForceSimpleCall */ false,
-          /* CheckCancelFlag */ true);
+      emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
+                    omp::Directive::OMPD_unknown,
+                    /* ForceSimpleCall */ false,
+                    /* CheckCancelFlag */ true, ThreadID);
 
       // Warp 0 copies reduce element from transfer medium
-      BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "w0then", WcFunc);
-      BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "w0else", WcFunc);
-      BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "w0ifcont", WcFunc);
+      BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
+      BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
+      BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
 
       Value *NumWarpsVal =
-          Builder.CreateLoad(I32Type, NumWarpsAddrCast, "num_warps");
+          Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
+      // Up to 32 threads in warp 0 are active.
       Value *IsActiveThread =
-          Builder.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
+          Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
       Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
 
-      // W0then
-      // SecMEdiumPtr = &medium[tid]
-      Builder.SetInsertPoint(W0ThenBB);
-      Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
-          ArrayTy, TransferMedium, {Builder.getInt64(0), ThreadID});
+      emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
+
+      // SecMediumPtr = &medium[tid]
       // SrcMediumVal = *SrcMediumPtr
-      // TODO(JAN): Bitcast here, but no load? skipping for now
-      Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
-          RedListArrayTy, ReduceListArg, 0, En.index());
-      Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+      Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
+          ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
+      // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
+      Value *TargetElemPtrPtr =
+          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
+                                    {ConstantInt::get(IndexTy, 0),
+                                     ConstantInt::get(IndexTy, En.index())});
+      Value *TargetElemPtrVal =
+          Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
+      Value *TargetElemPtr = TargetElemPtrVal;
       if (NumIters > 1)
-        TargetElementPtr = Builder.CreateGEP(Int32Ty, TargetElementPtr, Cnt);
+        TargetElemPtr =
+            Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
 
+      // *TargetElemPtr = SrcMediumVal;
       Value *SrcMediumValue =
-          Builder.CreateLoad(I32Type, SrcMediumPtrVal, /*IsVolatile*/ true);
-      Builder.CreateStore(SrcMediumValue, TargetElementPtr);
+          Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
+      Builder.CreateStore(SrcMediumValue, TargetElemPtr);
       Builder.CreateBr(W0MergeBB);
 
-      // W0else
-      Builder.SetInsertPoint(W0ElseBB);
+      emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
       Builder.CreateBr(W0MergeBB);
 
-      // W0endif
-      Builder.SetInsertPoint(W0MergeBB);
+      emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
+
       if (NumIters > 1) {
-        Cnt = Builder.CreateNSWAdd(Cnt, Builder.getInt32(1));
-        Builder.CreateStore(Cnt, CntAddrAcast);
-        Builder.CreateBr(PrecondBB);
-        Builder.SetInsertPoint(ExitBB);
+        Cnt = Builder.CreateNSWAdd(
+            Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
+        Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
+
+        auto *CurFn = Builder.GetInsertBlock()->getParent();
+        emitBranch(PrecondBB);
+        emitBlock(ExitBB, CurFn);
       }
+      RealTySize %= TySize;
     }
   }
 
   Builder.CreateRetVoid();
-  Builder.restoreIP(OldIP);
+  Builder.restoreIP(SavedIP);
+
   return WcFunc;
 }
 
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-///   For all data entries D in reduce_data:
-///     Copy local D to buffer.D[Idx]
-static Function *emitListToGlobalCopyFunction(
-    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
-    OpenMPIRBuilder &OMPBuilder) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+    AttributeList FuncAttrs) {
+  LLVMContext &Ctx = M.getContext();
+  FunctionType *FuncTy =
+      FunctionType::get(Builder.getVoidTy(),
+                        {Builder.getPtrTy(), Builder.getInt16Ty(),
+                         Builder.getInt16Ty(), Builder.getInt16Ty()},
+                        /* IsVarArg */ false);
+  Function *SarFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                       "_omp_reduction_shuffle_and_reduce_func", &M);
+  SarFunc->setAttributes(FuncAttrs);
+  SarFunc->addParamAttr(0, Attribute::NoUndef);
+  SarFunc->addParamAttr(1, Attribute::NoUndef);
+  SarFunc->addParamAttr(2, Attribute::NoUndef);
+  SarFunc->addParamAttr(3, Attribute::NoUndef);
+  SarFunc->addParamAttr(1, Attribute::SExt);
+  SarFunc->addParamAttr(2, Attribute::SExt);
+  SarFunc->addParamAttr(3, Attribute::SExt);
+  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
+  Builder.SetInsertPoint(EntryBB);
+
+  // Thread local Reduce list used to host the values of data to be reduced.
+  Argument *ReduceListArg = SarFunc->getArg(0);
+  // Current lane id; could be logical.
+  Argument *LaneIDArg = SarFunc->getArg(1);
+  // Offset of the remote source lane relative to the current lane.
+  Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
+  // Algorithm version.  This is expected to be known at compile time.
+  Argument *AlgoVerArg = SarFunc->getArg(3);
+
+  Type *ReduceListArgType = ReduceListArg->getType();
+  Type *LaneIDArgType = LaneIDArg->getType();
+  Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
+  Value *ReduceListAlloca = Builder.CreateAlloca(
+      ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
+  Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+                                             LaneIDArg->getName() + ".addr");
+  Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
+      LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
+  Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
+                                              AlgoVerArg->getName() + ".addr");
+  ArrayType *RedListArrayTy =
+      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+  // Create a local thread-private variable to host the Reduce list
+  // from a remote lane.
+  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
+      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
+
+  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceListAlloca, ReduceListArgType,
+      ReduceListAlloca->getName() + ".ascast");
+  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
+  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteLaneOffsetAlloca, LaneIDArgPtrType,
+      RemoteLaneOffsetAlloca->getName() + ".ascast");
+  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
+  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteReductionListAlloca, Builder.getPtrTy(),
+      RemoteReductionListAlloca->getName() + ".ascast");
+
+  Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
+  Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
+  Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
+  Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
+
+  Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
+  Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
+  Value *RemoteLaneOffset =
+      Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
+  Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
+
+  InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
+
+  // This loop iterates through the list of reduce elements and copies,
+  // element by element, from a remote lane in the warp to RemoteReduceList,
+  // hosted on the thread's stack.
+  emitReductionListCopy(
+      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
+      ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
+
+  // The actions to be performed on the Remote Reduce list is dependent
+  // on the algorithm version.
+  //
+  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
+  //  LaneId % 2 == 0 && Offset > 0):
+  //    do the reduction value aggregation
+  //
+  //  The thread local variable Reduce list is mutated in place to host the
+  //  reduced data, which is the aggregated value produced from local and
+  //  remote lanes.
+  //
+  //  Note that AlgoVer is expected to be a constant integer known at compile
+  //  time.
+  //  When AlgoVer==0, the first conjunction evaluates to true, making
+  //    the entire predicate true during compile time.
+  //  When AlgoVer==1, the second conjunction has only the second part to be
+  //    evaluated during runtime.  Other conjunctions evaluates to false
+  //    during compile time.
+  //  When AlgoVer==2, the third conjunction has only the second part to be
+  //    evaluated during runtime.  Other conjunctions evaluates to false
+  //    during compile time.
+  Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
+  Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+  Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
+  Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
+  Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
+  Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
+  Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
+  Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
+  Value *RemoteOffsetComp =
+      Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
+  Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
+  Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
+  Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
+
+  BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
+  BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
+  BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
+
+  Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
+  emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
+  // reduce_function(LocalReduceList, RemoteReduceList)
+  Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReduceList, Builder.getPtrTy());
+  Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RemoteListAddrCast, Builder.getPtrTy());
+  Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
+      ->addFnAttr(Attribute::NoUnwind);
+  Builder.CreateBr(MergeBB);
+
+  emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
+  Builder.CreateBr(MergeBB);
+
+  emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
+
+  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
+  // Reduce list.
+  Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
+  Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
+  Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
+
+  BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
+  BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
+  BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
+  Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
+
+  emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
+  emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
+                        ReductionInfos, RemoteListAddrCast, ReduceList);
+  Builder.CreateBr(CpyMergeBB);
+
+  emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
+  Builder.CreateBr(CpyMergeBB);
+
+  emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
+
+  Builder.CreateRetVoid();
+
+  return SarFunc;
+}
+
+Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+    AttributeList FuncAttrs) {
   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
   LLVMContext &Ctx = M.getContext();
-  Type *VoidTy = Type::getVoidTy(Ctx);
-  Type *Int32Ty = Builder.getInt32Ty();
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-  auto FuncTy =
-      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
   Function *LtGCFunc =
       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                        "_omp_reduction_list_to_global_copy_func", &M);
-  LtGCFunc->setDoesNotRecurse();
+  LtGCFunc->setAttributes(FuncAttrs);
+  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
 
-  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
   Builder.SetInsertPoint(EntryBlock);
 
-  // Set arg names
-  Argument *Arg0 = LtGCFunc->getArg(0);
-  Argument *Arg1 = LtGCFunc->getArg(1);
-  Argument *Arg2 = LtGCFunc->getArg(2);
-  Arg0->setName("buffer_arg");
-  Arg1->setName("idx_arg");
-  Arg2->setName("reduce_list_arg");
-
-  Value *BufferArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
-  Value *IdxArgAlloca =
-      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
-  Value *ReduceListArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGCFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGCFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
   Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
-  // FIXME(JAN): Assume a single globalized variable for now, this should be
-  // passed in
-  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
-  Type *TypeArgs[] = {SingleReductionTy};
-  StructType *ReductionsBufferTy =
-      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
-  Builder.CreateStore(Arg0, BufferArgAddrCast);
-  Builder.CreateStore(Arg1, IdxArgAddrCast);
-  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
-  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
-  Value *Idxs[] = {
-      Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
-  Value *ReduceListArg =
-      Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
-  // FIXME(Jan): Assume TEK_SCALAR
-  for (auto En : enumerate(ReductionInfos)) {
-    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
-    // FIXME(Jan): Compute array type
-    auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
-    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedListArrayTy, ReduceListArg, 0, En.index());
-    Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
 
+  Value *LocalReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Value *BufferArgVal =
+      Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    auto *RedListArrayTy =
+        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+    // Reduce element = LocalReduceList[i]
+    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, LocalReduceList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    // elemptr = ((CopyType*)(elemptrptr)) + I
+    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+
+    // Global = Buffer.VD[Idx];
     Value *BufferVD =
-        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
-    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
-        ReductionsBufferTy, BufferVD, 0, En.index());
-    Value *TargetElement = Builder.CreateLoad(RI.ElementType, TargetElementPtr);
-    Builder.CreateStore(TargetElement, GlobValPtr);
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
+    Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+    switch (RI.EvaluationKind) {
+    case EvaluationKindTy::Scalar: {
+      Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
+      Builder.CreateStore(TargetElement, GlobVal);
+      break;
+    }
+    case EvaluationKindTy::Complex: {
+      break;
+    }
+    case EvaluationKindTy::Aggregate:
+      Value *SizeVal =
+          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+      Builder.CreateMemCpy(
+          GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
+          M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
+      break;
+    }
   }
 
   Builder.CreateRetVoid();
@@ -2735,287 +2841,287 @@ static Function *emitListToGlobalCopyFunction(
   return LtGCFunc;
 }
 
-/// This function emits a helper that copies all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
-///   For all data entries D in reduce_data:
-///     Copy local D to buffer.D[Idx]
-static Function *emitGlobalToListCopyFunction(
-    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
-    OpenMPIRBuilder &OMPBuilder) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
   LLVMContext &Ctx = M.getContext();
-  Type *VoidTy = Type::getVoidTy(Ctx);
-  Type *Int32Ty = Builder.getInt32Ty();
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-  auto FuncTy =
-      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
-  Function *LtGCFunc =
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
+  Function *LtGRFunc =
       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                       "_omp_reduction_global_to_list_copy_func", &M);
-  LtGCFunc->setDoesNotRecurse();
+                       "_omp_reduction_list_to_global_reduce_func", &M);
+  LtGRFunc->setAttributes(FuncAttrs);
+  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
 
-  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGCFunc);
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
   Builder.SetInsertPoint(EntryBlock);
 
-  // Set arg names
-  Argument *Arg0 = LtGCFunc->getArg(0);
-  Argument *Arg1 = LtGCFunc->getArg(1);
-  Argument *Arg2 = LtGCFunc->getArg(2);
-  Arg0->setName("buffer_arg");
-  Arg1->setName("idx_arg");
-  Arg2->setName("reduce_list_arg");
-
-  Value *BufferArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
-  Value *IdxArgAlloca =
-      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
-  Value *ReduceListArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGRFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGRFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+  auto *RedListArrayTy =
+      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+  Value *LocalReduceList =
+      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
+
   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
   Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
-  // FIXME(JAN): Assume a single globalized variable for now, this should be
-  // passed in
-  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
-  Type *TypeArgs[] = {SingleReductionTy};
-  StructType *ReductionsBufferTy =
-      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
-  Builder.CreateStore(Arg0, BufferArgAddrCast);
-  Builder.CreateStore(Arg1, IdxArgAddrCast);
-  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
-  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
-  Value *Idxs[] = {
-      Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast, "idxs")};
-  Value *ReduceListArg =
-      Builder.CreateLoad(PtrTy, ReduceListArgAddrCast, "reduce_list");
-  // FIXME(Jan): Assume TEK_SCALAR
-  for (auto En : enumerate(ReductionInfos)) {
-    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
-    // FIXME(Jan): Compute array type
-    auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
-    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedListArrayTy, ReduceListArg, 0, En.index());
-    Value *TargetElementPtr = Builder.CreateLoad(PtrTy, TargetElementPtrPtr);
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, Builder.getPtrTy(),
+      LocalReduceList->getName() + ".ascast");
 
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  for (auto En : enumerate(ReductionInfos)) {
+    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, LocalReduceListAddrCast,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
     Value *BufferVD =
-        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
+    // Global = Buffer.VD[Idx];
     Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
-        ReductionsBufferTy, BufferVD, 0, En.index());
-    Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
-    Builder.CreateStore(TargetElement, TargetElementPtr);
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
   }
 
+  // Call reduce_function(GlobalReduceList, ReduceList)
+  Value *ReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
+      ->addFnAttr(Attribute::NoUnwind);
   Builder.CreateRetVoid();
   Builder.restoreIP(OldIP);
-  return LtGCFunc;
+  return LtGRFunc;
 }
 
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-///  void *GlobPtrs[];
-///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
-///  ...
-///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
-///  reduce_function(GlobPtrs, reduce_data);
-/// Create a function with a unique name and a "void (i8*, i8*)" signature in
-/// the given module and return it.
-static Function *emitListToGlobalReduceFunction(
-    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
-    OpenMPIRBuilder &OMPBuilder) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
+    AttributeList FuncAttrs) {
   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
   LLVMContext &Ctx = M.getContext();
-  Type *VoidTy = Type::getVoidTy(Ctx);
-  Type *Int32Ty = Builder.getInt32Ty();
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-  auto FuncTy =
-      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
-  Function *LtGRFunc =
+  FunctionType *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
+  Function *LtGCFunc =
       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                       "_omp_reduction_list_to_global_reduce_func", &M);
-  LtGRFunc->setDoesNotRecurse();
+                       "_omp_reduction_global_to_list_copy_func", &M);
+  LtGCFunc->setAttributes(FuncAttrs);
+  LtGCFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGCFunc->addParamAttr(2, Attribute::NoUndef);
 
-  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
   Builder.SetInsertPoint(EntryBlock);
 
-  // Set arg names
-  Argument *Arg0 = LtGRFunc->getArg(0);
-  Argument *Arg1 = LtGRFunc->getArg(1);
-  Argument *Arg2 = LtGRFunc->getArg(2);
-  Arg0->setName("buffer_arg");
-  Arg1->setName("idx_arg");
-  Arg2->setName("reduce_list_arg");
-
-  Value *BufferArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
-  Value *IdxArgAlloca =
-      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
-  Value *ReduceListArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
-  // FIXME(Jan): Compute array type
-  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
-  Value *LocalReduceList =
-      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
-
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGCFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGCFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGCFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
   Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
-  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
-  // FIXME(JAN): Assume a single globalized variable for now, this should be
-  // passed in
-  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
-  Type *TypeArgs[] = {SingleReductionTy};
-  StructType *ReductionsBufferTy =
-      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
-  Builder.CreateStore(Arg0, BufferArgAddrCast);
-  Builder.CreateStore(Arg1, IdxArgAddrCast);
-  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
-  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
-  Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
-  // FIXME(Jan): Assume TEK_SCALAR
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *LocalReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
   for (auto En : enumerate(ReductionInfos)) {
     const OpenMPIRBuilder::ReductionInfo &RI = En.value();
-    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+    auto *RedListArrayTy =
+        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+    // Reduce element = LocalReduceList[i]
+    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, LocalReduceList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    // elemptr = ((CopyType*)(elemptrptr)) + I
+    Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
+    // Global = Buffer.VD[Idx];
     Value *BufferVD =
-        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
     Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
-        ReductionsBufferTy, BufferVD, 0, En.index());
-    Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
+
+    switch (RI.EvaluationKind) {
+    case EvaluationKindTy::Scalar: {
+      Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
+      Builder.CreateStore(TargetElement, ElemPtr);
+      break;
+    }
+    case EvaluationKindTy::Complex: {
+      // FIXME(Jan): Complex type
+      break;
+    }
+    case EvaluationKindTy::Aggregate:
+      Value *SizeVal =
+          Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
+      Builder.CreateMemCpy(
+          ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+          GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
+          SizeVal, false);
+      break;
+    }
   }
 
-  Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
-  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList});
   Builder.CreateRetVoid();
   Builder.restoreIP(OldIP);
-  return LtGRFunc;
+  return LtGCFunc;
 }
 
-/// This function emits a helper that reduces all the reduction variables from
-/// the team into the provided global buffer for the reduction variables.
-///
-/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
-///  void *GlobPtrs[];
-///  GlobPtrs[0] = (void*)&buffer.D0[Idx];
-///  ...
-///  GlobPtrs[N] = (void*)&buffer.DN[Idx];
-///  reduce_function(GlobPtrs, reduce_data);
-/// Create a function with a unique name and a "void (i8*, i8*)" signature in
-/// the given module and return it.
-static Function *emitGlobalToListReduceFunction(
-    Module &M, const OpenMPIRBuilder::LocationDescription &Loc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos, Function *ReduceFn,
-    OpenMPIRBuilder &OMPBuilder) {
-  IRBuilder<> &Builder = OMPBuilder.Builder;
+Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
+    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
+    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
   OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
   LLVMContext &Ctx = M.getContext();
-  Type *VoidTy = Type::getVoidTy(Ctx);
-  Type *Int32Ty = Builder.getInt32Ty();
-  Type *PtrTy = PointerType::getUnqual(Ctx);
-  auto FuncTy =
-      FunctionType::get(VoidTy, {PtrTy, Int32Ty, PtrTy}, /* IsVarArg */ false);
+  auto *FuncTy = FunctionType::get(
+      Builder.getVoidTy(),
+      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
+      /* IsVarArg */ false);
   Function *LtGRFunc =
       Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                        "_omp_reduction_global_to_list_reduce_func", &M);
-  LtGRFunc->setDoesNotRecurse();
+  LtGRFunc->setAttributes(FuncAttrs);
+  LtGRFunc->addParamAttr(0, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(1, Attribute::NoUndef);
+  LtGRFunc->addParamAttr(2, Attribute::NoUndef);
 
-  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "", LtGRFunc);
+  BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
   Builder.SetInsertPoint(EntryBlock);
 
-  // Set arg names
-  Argument *Arg0 = LtGRFunc->getArg(0);
-  Argument *Arg1 = LtGRFunc->getArg(1);
-  Argument *Arg2 = LtGRFunc->getArg(2);
-  Arg0->setName("buffer_arg");
-  Arg1->setName("idx_arg");
-  Arg2->setName("reduce_list_arg");
-
-  Value *BufferArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg0->getName() + ".addr");
-  Value *IdxArgAlloca =
-      Builder.CreateAlloca(Int32Ty, nullptr, Arg1->getName() + ".addr");
-  Value *ReduceListArgAlloca =
-      Builder.CreateAlloca(PtrTy, nullptr, Arg2->getName() + ".addr");
-  // FIXME(Jan): Compute array type
-  auto *RedListArrayTy = ArrayType::get(PtrTy, 1);
+  // Buffer: global reduction buffer.
+  Argument *BufferArg = LtGRFunc->getArg(0);
+  // Idx: index of the buffer.
+  Argument *IdxArg = LtGRFunc->getArg(1);
+  // ReduceList: thread local Reduce list.
+  Argument *ReduceListArg = LtGRFunc->getArg(2);
+
+  Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
+                                                BufferArg->getName() + ".addr");
+  Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
+                                             IdxArg->getName() + ".addr");
+  Value *ReduceListArgAlloca = Builder.CreateAlloca(
+      Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
+  ArrayType *RedListArrayTy =
+      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
   Value *LocalReduceList =
       Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
 
   Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      BufferArgAlloca, PtrTy, BufferArgAlloca->getName() + ".acast");
+      BufferArgAlloca, Builder.getPtrTy(),
+      BufferArgAlloca->getName() + ".ascast");
   Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      IdxArgAlloca, PtrTy, IdxArgAlloca->getName() + ".acast");
+      IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
   Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      ReduceListArgAlloca, PtrTy, ReduceListArgAlloca->getName() + ".acast");
-  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-      LocalReduceList, PtrTy, LocalReduceList->getName() + ".acast");
-  // FIXME(JAN): Assume a single globalized variable for now, this should be
-  // passed in
-  Type *SingleReductionTy = ReductionInfos.begin()->ElementType;
-  Type *TypeArgs[] = {SingleReductionTy};
-  StructType *ReductionsBufferTy =
-      StructType::create(Ctx, TypeArgs, "_globalized_locals_ty");
-
-  Builder.CreateStore(Arg0, BufferArgAddrCast);
-  Builder.CreateStore(Arg1, IdxArgAddrCast);
-  Builder.CreateStore(Arg2, ReduceListArgAddrCast);
-
-  Value *BufferArg = Builder.CreateLoad(PtrTy, BufferArgAddrCast, "buffer");
-  Value *Idxs[] = {Builder.CreateLoad(Int32Ty, IdxArgAddrCast, "idxs")};
-  // FIXME(Jan): Assume TEK_SCALAR
+      ReduceListArgAlloca, Builder.getPtrTy(),
+      ReduceListArgAlloca->getName() + ".ascast");
+  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LocalReduceList, Builder.getPtrTy(),
+      LocalReduceList->getName() + ".ascast");
+
+  Builder.CreateStore(BufferArg, BufferArgAddrCast);
+  Builder.CreateStore(IdxArg, IdxArgAddrCast);
+  Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
+
+  Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
+  Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
   for (auto En : enumerate(ReductionInfos)) {
-    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
-    Value *TargetElementPtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedListArrayTy, LocalReduceListAddrCast, 0, En.index());
+    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
+        RedListArrayTy, ReductionList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    // Global = Buffer.VD[Idx];
     Value *BufferVD =
-        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArg, Idxs);
+        Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
     Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
-        ReductionsBufferTy, BufferVD, 0, En.index());
+        ReductionsBufferTy, BufferVD, 0, En.index(), "sum");
     Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
   }
 
-  Value *ReduceList = Builder.CreateLoad(PtrTy, ReduceListArgAddrCast);
-  Builder.CreateCall(ReduceFn, {ReduceList, LocalReduceListAddrCast});
+  // Call reduce_function(ReduceList, GlobalReduceList)
+  Value *ReduceList =
+      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
+  Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
+      ->addFnAttr(Attribute::NoUnwind);
   Builder.CreateRetVoid();
   Builder.restoreIP(OldIP);
   return LtGRFunc;
 }
 
-static Function *getFreshReductionFunc(Module &M) {
-  Type *VoidTy = Type::getVoidTy(M.getContext());
-  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
-  auto *FuncTy =
-      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
-  return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
-                          ".omp.reduction.func", &M);
-}
+std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
+  std::string Suffix =
+      createPlatformSpecificName({"omp", "reduction", "reduction_func"});
+  return (Name + Suffix).str();
+}
+
+Function *OpenMPIRBuilder::createReductionFunction(
+    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, bool IsGPU,
+    ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
+  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
+                                   {Builder.getPtrTy(), Builder.getPtrTy()},
+                                   /* IsVarArg */ false);
+  std::string Name = getReductionFuncName(ReducerName);
+  auto *ReductionFunc =
+      Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
+  ReductionFunc->setAttributes(FuncAttrs);
+  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
+  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
+  BasicBlock *EntryBB =
+      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
+  Builder.SetInsertPoint(EntryBB);
 
-static void populateReductionFunction(
-    Function *ReductionFunc,
-    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
-    IRBuilder<> &Builder, bool IsGPU) {
-  Module *Module = ReductionFunc->getParent();
-  BasicBlock *ReductionFuncBlock =
-      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
-  Builder.SetInsertPoint(ReductionFuncBlock);
   Value *LHSArrayPtr = nullptr;
   Value *RHSArrayPtr = nullptr;
   if (IsGPU) {
@@ -3031,10 +3137,10 @@ static void populateReductionFunction(
         Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
     Value *RHSAlloca =
         Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
-    Value *LHSAddrCast =
-        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
-    Value *RHSAddrCast =
-        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
+    Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
+    Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
     Builder.CreateStore(Arg0, LHSAddrCast);
     Builder.CreateStore(Arg1, RHSAddrCast);
     LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
@@ -3044,30 +3150,65 @@ static void populateReductionFunction(
     RHSArrayPtr = ReductionFunc->getArg(1);
   }
 
-  unsigned NumReductions = ReductionInfos.size();
-  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
-
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
+  SmallVector<Value *> LHSPtrs, RHSPtrs;
   for (auto En : enumerate(ReductionInfos)) {
-    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
-    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedArrayTy, LHSArrayPtr, 0, En.index());
-    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
-    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-        LHSI8Ptr, RI.Variable->getType());
-    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
-    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
-        RedArrayTy, RHSArrayPtr, 0, En.index());
+    const ReductionInfo &RI = En.value();
+    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
+        RedArrayTy, RHSArrayPtr,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
     Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
-    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
-        RHSI8Ptr, RI.PrivateVariable->getType());
-    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
-    Value *Reduced;
-    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
-    if (!Builder.GetInsertBlock())
-      return;
-    Builder.CreateStore(Reduced, LHSPtr);
+    Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType(),
+        RHSI8Ptr->getName() + ".ascast");
+
+    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
+        RedArrayTy, LHSArrayPtr,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
+
+    if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+      LHSPtrs.emplace_back(LHS);
+      RHSPtrs.emplace_back(RHS);
+    } else {
+      LHS = Builder.CreateLoad(RI.ElementType, LHS);
+      RHS = Builder.CreateLoad(RI.ElementType, RHS);
+      Value *Reduced;
+      RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
+      if (!Builder.GetInsertBlock())
+        return ReductionFunc;
+      Builder.CreateStore(Reduced, LHS);
+    }
   }
+
+  if (ReductionGenCBTy == ReductionGenCBTy::Clang)
+    for (auto En : enumerate(ReductionInfos)) {
+      unsigned Index = En.index();
+      const ReductionInfo &RI = En.value();
+      Value *LHSFixupPtr, *RHSFixupPtr;
+      Builder.restoreIP(RI.ReductionGenClang(
+          Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
+
+      // Fix the CallBack code genereated to use the correct Values for the LHS
+      // and RHS
+      LHSFixupPtr->replaceUsesWithIf(
+          LHSPtrs[Index], [ReductionFunc](const Use &U) {
+            return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+                   ReductionFunc;
+          });
+      RHSFixupPtr->replaceUsesWithIf(
+          RHSPtrs[Index], [ReductionFunc](const Use &U) {
+            return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+                   ReductionFunc;
+          });
+    }
+
   Builder.CreateRetVoid();
+  return ReductionFunc;
 }
 
 static void
@@ -3077,7 +3218,8 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
     (void)RI;
     assert(RI.Variable && "expected non-null variable");
     assert(RI.PrivateVariable && "expected non-null private variable");
-    assert(RI.ReductionGen && "expected non-null reduction generator callback");
+    assert((RI.ReductionGen || RI.ReductionGenClang) &&
+           "expected non-null reduction generator callback");
     // JAN: Skip this assertion for GPU, address spaces are present
     if (!IsGPU) {
       assert(
@@ -3092,73 +3234,103 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
 
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
-    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
-    bool IsTeamsReduction, bool HasDistribute) {
-  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
-  LLVMContext &Ctx = M.getContext();
+    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
+    bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
+    ReductionGenCBTy ReductionGenCBTy, std::optional<omp::GV> GridValue,
+    unsigned ReductionBufNum, Value *SrcLocInfo) {
   if (!updateToLocation(Loc))
     return InsertPointTy();
+  Builder.restoreIP(CodeGenIP);
+  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
+  LLVMContext &Ctx = M.getContext();
+
+  // Source location for the ident struct
+  if (!SrcLocInfo) {
+    uint32_t SrcLocStrSize;
+    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+  }
 
   if (ReductionInfos.size() == 0)
     return Builder.saveIP();
 
-  assert(ReductionInfos.size() == 1 && "More than one reduction variable");
+  Function *CurFunc = Builder.GetInsertBlock()->getParent();
+  AttributeList FuncAttrs;
+  AttrBuilder AttrBldr(Ctx);
+  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
+    AttrBldr.addAttribute(Attr);
+  AttrBldr.removeAttribute(Attribute::OptimizeNone);
+  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
 
-  // Copied code from createReductions
-  BasicBlock *InsertBlock = Loc.IP.getBlock();
-  BasicBlock *ContinuationBlock =
-      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
-  InsertBlock->getTerminator()->eraseFromParent();
-  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
+  // Set the grid value in the config needed for lowering later on
+  if (GridValue.has_value())
+    Config.setGridValue(GridValue.value());
+  else
+    Config.setGridValue(getGridValue(T, Config.TargetFeatures));
 
   Function *ReductionFunc = nullptr;
   if (GLOBAL_ReductionFunc) {
     ReductionFunc = GLOBAL_ReductionFunc;
   } else {
-    ReductionFunc = getFreshReductionFunc(M);
-    GLOBAL_ReductionFunc = ReductionFunc;
-    InsertPointTy CurIP = Builder.saveIP();
-    populateReductionFunction(ReductionFunc, ReductionInfos, Builder, true);
-    Builder.restoreIP(CurIP);
+    CodeGenIP = Builder.saveIP();
+    ReductionFunc = createReductionFunction(
+        Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, true,
+        ReductionGenCBTy, FuncAttrs);
+    Builder.restoreIP(CodeGenIP);
   }
 
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
   Value *RTLoc =
-      getOrCreateIdent(SrcLocStr, SrcLocStrSize, llvm::omp::IdentFlag(0), 0);
+      getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
+
+  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
+  // RedList, shuffle_reduce_func, interwarp_copy_func);
+  // or
+  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
+  Value *Res;
 
-  // 1. Build a list of reduction variables
+  // 1. Build a list of reduction variables.
+  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
   auto Size = ReductionInfos.size();
-  // FIXME(JAN): skipping variably modified type storage for array size
   Type *PtrTy = PointerType::getUnqual(Ctx);
   Type *RedArrayTy = ArrayType::get(PtrTy, Size);
-  InsertPointTy CurIP = Builder.saveIP();
+  CodeGenIP = Builder.saveIP();
   Builder.restoreIP(AllocaIP);
   Value *ReductionListAlloca =
       Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
-  Value *ReductionList =
-      Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionListAlloca, PtrTy);
-  Builder.restoreIP(CurIP);
+  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
+  Builder.restoreIP(CodeGenIP);
+  Type *IndexTy = Builder.getIndexTy(
+      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
   for (auto En : enumerate(ReductionInfos)) {
     const ReductionInfo &RI = En.value();
-    Value *ElemPtr = Builder.CreateConstGEP2_64(RedArrayTy, ReductionList, 0,
-                                                En.index(), "elem_ptr");
+    Value *ElemPtr = Builder.CreateInBoundsGEP(
+        RedArrayTy, ReductionList,
+        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
     Value *CastElem =
         Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
     Builder.CreateStore(CastElem, ElemPtr);
   }
-  CurIP = Builder.saveIP();
-  Function *SarFunc = emitShuffleAndReduceFunction(M, Loc, ReductionInfos,
-                                                   ReductionFunc, *this);
-  Function *WcFunc = emitInterWarpCopyFunction(M, Loc, ReductionInfos, *this);
-  Builder.restoreIP(CurIP);
+  CodeGenIP = Builder.saveIP();
+  Function *SarFunc =
+      emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
+  Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
+  Builder.restoreIP(CodeGenIP);
 
-  Value *RL =
-    Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
-  Value *ReductionDataSize =
-      getTypeSizeInBytesValue(Builder, M, ReductionInfos.begin()->ElementType);
+  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
 
-  Value *Res;
+  unsigned MaxDataSize = 0;
+  SmallVector<Type *> ReductionTypeArgs;
+  for (auto En : enumerate(ReductionInfos)) {
+    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
+    if (Size > MaxDataSize)
+      MaxDataSize = Size;
+    ReductionTypeArgs.emplace_back(En.value().ElementType);
+  }
+  Value *ReductionDataSize =
+      Builder.getInt64(MaxDataSize * ReductionInfos.size());
   if (!IsTeamsReduction) {
     Value *SarFuncCast =
         Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
@@ -3169,25 +3341,27 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
         RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
     Res = Builder.CreateCall(Pv2Ptr, Args);
   } else {
-    CurIP = Builder.saveIP();
-    Function *LtGCFunc =
-        emitListToGlobalCopyFunction(M, Loc, ReductionInfos, *this);
-    Function *LtGRFunc = emitListToGlobalReduceFunction(M, Loc, ReductionInfos,
-                                                        ReductionFunc, *this);
-    Function *GtLCFunc =
-        emitGlobalToListCopyFunction(M, Loc, ReductionInfos, *this);
-    Function *GtLRFunc = emitGlobalToListReduceFunction(M, Loc, ReductionInfos,
-                                                        ReductionFunc, *this);
-    Builder.restoreIP(CurIP);
-
+    CodeGenIP = Builder.saveIP();
+    StructType *ReductionsBufferTy = StructType::create(
+        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
     Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
         RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
+    Function *LtGCFunc = emitListToGlobalCopyFunction(
+        ReductionInfos, ReductionsBufferTy, FuncAttrs);
+    Function *LtGRFunc = emitListToGlobalReduceFunction(
+        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+    Function *GtLCFunc = emitGlobalToListCopyFunction(
+        ReductionInfos, ReductionsBufferTy, FuncAttrs);
+    Function *GtLRFunc = emitGlobalToListReduceFunction(
+        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
+    Builder.restoreIP(CodeGenIP);
 
-    Value *KernelTeamsReductionPtr = Builder.CreateCall(RedFixedBuferFn, {});
+    Value *KernelTeamsReductionPtr = Builder.CreateCall(
+        RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
 
     Value *Args3[] = {RTLoc,
                       KernelTeamsReductionPtr,
-                      Builder.getInt32(1024),
+                      Builder.getInt32(ReductionBufNum),
                       ReductionDataSize,
                       RL,
                       SarFunc,
@@ -3202,32 +3376,50 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
     Res = Builder.CreateCall(TeamsReduceFn, Args3);
   }
 
-  if (IsTeamsReduction || !HasDistribute) {
-    Function *CurFunc = Builder.GetInsertBlock()->getParent();
-    BasicBlock *ExitBB =
-        BasicBlock::Create(Ctx, ".omp.reduction.done", CurFunc);
-    BasicBlock *ThenBB =
-        BasicBlock::Create(Ctx, ".omp.reduction.then", CurFunc);
-    Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
-    Builder.CreateCondBr(Cond, ThenBB, ExitBB);
+  // 5. Build if (res == 1)
+  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
+  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
+  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
+  Builder.CreateCondBr(Cond, ThenBB, ExitBB);
 
-    Builder.SetInsertPoint(ThenBB);
-    for (auto En : enumerate(ReductionInfos)) {
-      const ReductionInfo &RI = En.value();
-      Value *InputVal = Builder.CreateLoad(RI.ElementType, RI.Variable);
-      Value *RedVal = Builder.CreateLoad(
-          RI.ElementType, Builder.CreatePointerBitCastOrAddrSpaceCast(
-                              RI.PrivateVariable, PtrTy));
-      Value *sum;
-      Builder.restoreIP(
-          RI.ReductionGen(Builder.saveIP(), InputVal, RedVal, sum));
-      Builder.CreateStore(sum, RI.Variable);
-      Builder.CreateBr(ExitBB);
+  // 6. Build then branch: where we have reduced values in the master
+  //    thread in each team.
+  //    __kmpc_end_reduce{_nowait}(<gtid>);
+  //    break;
+  emitBlock(ThenBB, CurFunc);
+
+  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
+  for (auto En : enumerate(ReductionInfos)) {
+    const ReductionInfo &RI = En.value();
+    Value *LHS = RI.Variable;
+    Value *RHS =
+        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
+
+    if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
+      Value *LHSPtr, *RHSPtr;
+      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
+                                             &LHSPtr, &RHSPtr, CurFunc));
+
+      // Fix the CallBack code genereated to use the correct Values for the LHS
+      // and RHS
+      LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
+        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+               ReductionFunc;
+      });
+      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
+        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
+               ReductionFunc;
+      });
+    } else {
+      // LHS = Builder.CreateLoad(LHS);
+      // LHS = Builder.CreateLoad(LHS);
+      // Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS));
     }
-    Builder.SetInsertPoint(ExitBB);
   }
-  Builder.CreateBr(ContinuationBlock);
-  Builder.SetInsertPoint(ContinuationBlock);
+  emitBlock(ExitBB, CurFunc);
+
+  Config.setEmitLLVMUsed();
+
   return Builder.saveIP();
 }
 
@@ -3236,8 +3428,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
     bool IsTeamsReduction, bool HasDistribute) {
   if (Config.isGPU())
-    return createReductionsGPU(Loc, AllocaIP, ReductionInfos, IsNoWait,
-                               IsTeamsReduction, HasDistribute);
+    return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
+                               IsNoWait, IsTeamsReduction, HasDistribute);
 
   checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
 
@@ -3275,10 +3467,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   Module *Module = Func->getParent();
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
-  bool CanGenerateAtomic =
-      llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
-        return RI.AtomicReductionGen;
-      });
+  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
+    return RI.AtomicReductionGen;
+  });
   Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                   CanGenerateAtomic
                                       ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
@@ -3288,7 +3479,8 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   const DataLayout &DL = Module->getDataLayout();
   unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
   Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
-  Function *ReductionFunc = getFreshReductionFunc(*Module);
+  Function *ReductionFunc = createReductionFunction(
+      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos);
   Value *Lock = getOMPCriticalRegionLock(".reduction");
   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -3355,7 +3547,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   // Populate the outlined reduction function using the elementwise reduction
   // function. Partial values are extracted from the type-erased array of
   // pointers to private variables.
-  populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
+  // populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
   Builder.SetInsertPoint(ContinuationBlock);
   return Builder.saveIP();
 }
@@ -5690,7 +5882,8 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
                              Ident,
                              DynamicEnvironment,
                          });
-  Twine KernelEnvironmentName = KernelName + "_kernel_environment";
+  std::string KernelEnvironmentName =
+      (KernelName + "_kernel_environment").str();
   GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
       M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
       KernelEnvironmentInitializer, KernelEnvironmentName,
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index ed7c3b29f88af5..a0db20e5bece5e 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -822,14 +822,17 @@ static void collectReductionInfo(
   // Collect the reduction information.
   reductionInfos.reserve(numReductions);
   for (unsigned i = 0; i < numReductions; ++i) {
-    llvm::OpenMPIRBuilder::AtomicReductionGenTy atomicGen = nullptr;
+    llvm::OpenMPIRBuilder::AtomicReductionGenCB atomicGen = nullptr;
     if (owningAtomicReductionGens[i])
       atomicGen = owningAtomicReductionGens[i];
     llvm::Value *variable =
         moduleTranslation.lookupValue(loop.getReductionVars()[i]);
-    reductionInfos.push_back(
-        {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
-         privateReductionVariables[i], owningReductionGens[i], atomicGen});
+    reductionInfos.push_back(llvm::OpenMPIRBuilder::ReductionInfo(
+        moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+        privateReductionVariables[i],
+        /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
+        owningReductionGens[i],
+        /*ReductionGenClang=*/nullptr, atomicGen));
   }
 }
 

>From 248e87c98a92a517962ded36ac15a3ac426e66a9 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 7 Feb 2024 14:23:41 +0000
Subject: [PATCH 03/10] Remove ReductionInfoManager. Add assert failures for
 complex data types.

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       | 47 -------------------
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 12 +++--
 2 files changed, 8 insertions(+), 51 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index abf05cc1e5e76a..0b446743ba0fbc 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1348,50 +1348,6 @@ class OpenMPIRBuilder {
     Value *ScratchpadWidth = nullptr;
   };
 
-  /// A class that manages the reduction info to facilitate lowering of
-  /// reductions at multiple levels of parallelism. For example handling teams
-  /// and parallel reductions on GPUs
-  class ReductionInfoManager {
-  private:
-    SmallVector<ReductionInfo> ReductionInfos;
-    std::optional<InsertPointTy> PrivateVarAllocaIP;
-
-  public:
-    ReductionInfoManager(){};
-    void clear() {
-      ReductionInfos.clear();
-      PrivateVarAllocaIP.reset();
-    }
-
-    Value *allocatePrivateReductionVar(IRBuilderBase &builder,
-                                       InsertPointTy &allocaIP, Type *VarType) {
-      Type *ptrTy = PointerType::getUnqual(builder.getContext());
-      Value *var = builder.CreateAlloca(VarType);
-      var->setName("private_redvar");
-      Value *castVar = builder.CreatePointerBitCastOrAddrSpaceCast(var, ptrTy);
-      ReductionInfos.push_back(ReductionInfo(castVar));
-      return castVar;
-    }
-
-    ReductionInfo getReductionInfo(unsigned Index) {
-      return ReductionInfos[Index];
-    }
-    ReductionInfo setReductionInfo(unsigned Index, ReductionInfo &RI) {
-      return ReductionInfos[Index] = RI;
-    }
-    Value *getPrivateReductionVariable(unsigned Index) {
-      return ReductionInfos[Index].PrivateVariable;
-    }
-    SmallVector<ReductionInfo> &getReductionInfos() { return ReductionInfos; }
-
-    bool hasPrivateVarAllocaIP() { return PrivateVarAllocaIP.has_value(); }
-    InsertPointTy getPrivateVarAllocaIP() {
-      assert(PrivateVarAllocaIP.has_value() && "AllocaIP not set");
-      return *PrivateVarAllocaIP;
-    }
-    void setPrivateVarAllocaIP(InsertPointTy IP) { PrivateVarAllocaIP = IP; }
-  };
-
   /// Supporting functions for Reductions CodeGen.
 private:
   /// Emit the llvm.used metadata.
@@ -2086,9 +2042,6 @@ class OpenMPIRBuilder {
   /// Info manager to keep track of target regions.
   OffloadEntriesInfoManager OffloadInfoManager;
 
-  /// Info manager to keep track of reduction information;
-  ReductionInfoManager RIManager;
-
   /// The target triple of the underlying module.
   const Triple T;
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e3054ab2e5d064..276dae628c1450 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2319,6 +2319,7 @@ void OpenMPIRBuilder::emitReductionListCopy(
         break;
       }
       case EvaluationKindTy::Complex: {
+        assert(false && "Complex data type not handled");
         break;
       }
       case EvaluationKindTy::Aggregate: {
@@ -2824,9 +2825,10 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
       break;
     }
     case EvaluationKindTy::Complex: {
+      assert(false && "Complex data type not handled");
       break;
     }
-    case EvaluationKindTy::Aggregate:
+    case EvaluationKindTy::Aggregate: {
       Value *SizeVal =
           Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
       Builder.CreateMemCpy(
@@ -2834,6 +2836,7 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
           M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
       break;
     }
+    }
   }
 
   Builder.CreateRetVoid();
@@ -2998,10 +3001,10 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
       break;
     }
     case EvaluationKindTy::Complex: {
-      // FIXME(Jan): Complex type
+      assert(false && "Complex data type not handled");
       break;
     }
-    case EvaluationKindTy::Aggregate:
+    case EvaluationKindTy::Aggregate: {
       Value *SizeVal =
           Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
       Builder.CreateMemCpy(
@@ -3010,6 +3013,7 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
           SizeVal, false);
       break;
     }
+    }
   }
 
   Builder.CreateRetVoid();
@@ -3220,7 +3224,6 @@ checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
     assert(RI.PrivateVariable && "expected non-null private variable");
     assert((RI.ReductionGen || RI.ReductionGenClang) &&
            "expected non-null reduction generator callback");
-    // JAN: Skip this assertion for GPU, address spaces are present
     if (!IsGPU) {
       assert(
           RI.Variable->getType() == RI.PrivateVariable->getType() &&
@@ -4067,6 +4070,7 @@ static void createTargetLoopWorkshareCall(
   // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
   // not be the right way to fix it, but this works for now.
   if (OMPBuilder->Config.isGPU()) {
+    assert(false && "Akash");
     if (LoopType != WorksharingLoopType::DistributeStaticLoop)
       Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
     LLVMContext &Ctx = M.getContext();

>From 9244fd125745a8e655476510983ab59a8f543c0d Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 7 Feb 2024 14:27:15 +0000
Subject: [PATCH 04/10] Remove unintended debug assert added in previous
 commit.

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 276dae628c1450..e6c01e08e1a8cf 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4070,7 +4070,6 @@ static void createTargetLoopWorkshareCall(
   // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
   // not be the right way to fix it, but this works for now.
   if (OMPBuilder->Config.isGPU()) {
-    assert(false && "Akash");
     if (LoopType != WorksharingLoopType::DistributeStaticLoop)
       Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
     LLVMContext &Ctx = M.getContext();

>From 604456eae48830355e237b03e581e9f604c47c7e Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 13 Feb 2024 12:48:38 +0000
Subject: [PATCH 05/10] Remove unintended code from MLIRTranslation.

---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 23 +++++--------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index a0db20e5bece5e..6b3e244267a5c2 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -827,12 +827,12 @@ static void collectReductionInfo(
       atomicGen = owningAtomicReductionGens[i];
     llvm::Value *variable =
         moduleTranslation.lookupValue(loop.getReductionVars()[i]);
-    reductionInfos.push_back(llvm::OpenMPIRBuilder::ReductionInfo(
-        moduleTranslation.convertType(reductionDecls[i].getType()), variable,
-        privateReductionVariables[i],
-        /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
-        owningReductionGens[i],
-        /*ReductionGenClang=*/nullptr, atomicGen));
+    reductionInfos.push_back(
+        {moduleTranslation.convertType(reductionDecls[i].getType()), variable,
+         privateReductionVariables[i],
+         /*EvaluationKind=*/llvm::OpenMPIRBuilder::EvaluationKindTy::Scalar,
+         owningReductionGens[i],
+         /*ReductionGenClang=*/nullptr, atomicGen});
   }
 }
 
@@ -2908,17 +2908,6 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::amendOperation(
             }
             return failure();
           })
-      .Case("omp.target",
-            [&](Attribute attr) {
-              if (auto targetAttr = attr.dyn_cast<omp::TargetAttr>()) {
-                llvm::OpenMPIRBuilderConfig &config =
-                    moduleTranslation.getOpenMPBuilder()->Config;
-                config.TargetCPU = targetAttr.getTargetCpu();
-                config.TargetFeatures = targetAttr.getTargetFeatures();
-                return success();
-              }
-              return failure();
-            })
       .Default([](Attribute) {
         // Fall through for omp attributes that do not require lowering.
         return success();

>From 5f0b81744ab1ab9c1ebf3d0d2a2100171d421929 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 13 Feb 2024 13:48:43 +0000
Subject: [PATCH 06/10] Remove unintended code for WSLoop

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index e6c01e08e1a8cf..97855ed80f3156 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -4063,24 +4063,10 @@ getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
 static void createTargetLoopWorkshareCall(
     OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
     BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
-    Type *ParallelTaskPtr, Value *TripCountOrig, Function &LoopBodyFn) {
+    Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
+  Type *TripCountTy = TripCount->getType();
   Module &M = OMPBuilder->M;
   IRBuilder<> &Builder = OMPBuilder->Builder;
-  Value *TripCount = TripCountOrig;
-  // FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
-  // not be the right way to fix it, but this works for now.
-  if (OMPBuilder->Config.isGPU()) {
-    if (LoopType != WorksharingLoopType::DistributeStaticLoop)
-      Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
-    LLVMContext &Ctx = M.getContext();
-    Type *IVTy = TripCountOrig->getType();
-    Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32
-                             ? Type::getInt32Ty(Ctx)
-                             : Type::getInt64Ty(Ctx);
-    Constant *One = ConstantInt::get(InternalIVTy, 1);
-    TripCount = Builder.CreateSub(TripCountOrig, One, "modified_trip_count");
-  }
-  Type *TripCountTy = TripCount->getType();
   FunctionCallee RTLFn =
       getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
   SmallVector<Value *, 8> RealArgs;

>From ef3f5a0e4d6ce0080c21f6fe3412baff16902dd7 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Tue, 13 Feb 2024 15:02:45 +0000
Subject: [PATCH 07/10] Remove more non-essential code to this patch.

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |  5 ---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 34 +++++--------------
 2 files changed, 9 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 0b446743ba0fbc..2db4708a0ac60d 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -106,11 +106,6 @@ class OpenMPIRBuilderConfig {
   /// Flag for specifying if offloading is mandatory.
   std::optional<bool> OpenMPOffloadMandatory;
 
-  /// Name of the target processor.
-  StringRef TargetCPU;
-  /// String representation of the target processor's features.
-  StringRef TargetFeatures;
-
   /// First separator used between the initial two parts of a name.
   std::optional<StringRef> FirstSeparator;
   /// Separator used between all of the rest consecutive parts of s name
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 97855ed80f3156..f76a8cd8107aed 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -147,22 +147,13 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
 
 Function *GLOBAL_ReductionFunc = nullptr;
 
-static const omp::GV &getGridValue(const Triple &T, StringRef Features) {
-  if (T.isAMDGPU()) {
-    if (Features.count("+wavefrontsize64"))
-      return omp::getAMDGPUGridValues<64>();
-    return omp::getAMDGPUGridValues<32>();
-  }
-  if (T.isNVPTX())
-    return omp::NVPTXGridValues;
-  llvm_unreachable("No grid value available for this architecture!");
-}
-
 static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
   if (T.isAMDGPU()) {
     StringRef Features =
         Kernel->getFnAttribute("target-features").getValueAsString();
-    return getGridValue(T, Features);
+    if (Features.count("+wavefrontsize64"))
+      return omp::getAMDGPUGridValues<64>();
+    return omp::getAMDGPUGridValues<32>();
   }
   if (T.isNVPTX())
     return omp::NVPTXGridValues;
@@ -3265,12 +3256,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
   AttrBldr.removeAttribute(Attribute::OptimizeNone);
   FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
 
-  // Set the grid value in the config needed for lowering later on
-  if (GridValue.has_value())
-    Config.setGridValue(GridValue.value());
-  else
-    Config.setGridValue(getGridValue(T, Config.TargetFeatures));
-
   Function *ReductionFunc = nullptr;
   if (GLOBAL_ReductionFunc) {
     ReductionFunc = GLOBAL_ReductionFunc;
@@ -3282,6 +3267,12 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
     Builder.restoreIP(CodeGenIP);
   }
 
+  // Set the grid value in the config needed for lowering later on
+  if (GridValue.has_value())
+    Config.setGridValue(GridValue.value());
+  else
+    Config.setGridValue(getGridValue(T, ReductionFunc));
+
   uint32_t SrcLocStrSize;
   Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
   Value *RTLoc =
@@ -5804,9 +5795,6 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
 
   Function *Kernel = Builder.GetInsertBlock()->getParent();
 
-  // Set the grid value in the config needed for lowering later on
-  Config.setGridValue(getGridValue(T, Config.TargetFeatures));
-
   // Manifest the launch configuration in the metadata matching the kernel
   // environment.
   if (MinTeamsVal > 1 || MaxTeamsVal > 0)
@@ -6060,10 +6048,6 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
     OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
     if (T.isAMDGCN())
       OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
-    if (!Config.TargetCPU.empty())
-      OutlinedFn->addFnAttr("target-cpu", Config.TargetCPU);
-    if (!Config.TargetFeatures.empty())
-      OutlinedFn->addFnAttr("target-features", Config.TargetFeatures);
   }
 }
 

>From cb92e35d630b7a85b71a6887446a892b0d203e00 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 13:38:53 +0000
Subject: [PATCH 08/10] Update OMPIRBuilderTest.cpp.

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp      | 18 +++++++-----------
 .../unittests/Frontend/OpenMPIRBuilderTest.cpp | 16 ++++++++++++----
 2 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f76a8cd8107aed..01d6a1e162455e 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3155,7 +3155,7 @@ Function *OpenMPIRBuilder::createReductionFunction(
         RedArrayTy, RHSArrayPtr,
         {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
     Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
-    Value *RHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
         RHSI8Ptr, RI.PrivateVariable->getType(),
         RHSI8Ptr->getName() + ".ascast");
 
@@ -3163,20 +3163,20 @@ Function *OpenMPIRBuilder::createReductionFunction(
         RedArrayTy, LHSArrayPtr,
         {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
     Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
-    Value *LHS = Builder.CreatePointerBitCastOrAddrSpaceCast(
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
         LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
 
     if (ReductionGenCBTy == ReductionGenCBTy::Clang) {
-      LHSPtrs.emplace_back(LHS);
-      RHSPtrs.emplace_back(RHS);
+      LHSPtrs.emplace_back(LHSPtr);
+      RHSPtrs.emplace_back(RHSPtr);
     } else {
-      LHS = Builder.CreateLoad(RI.ElementType, LHS);
-      RHS = Builder.CreateLoad(RI.ElementType, RHS);
+      Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+      Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
       Value *Reduced;
       RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
       if (!Builder.GetInsertBlock())
         return ReductionFunc;
-      Builder.CreateStore(Reduced, LHS);
+      Builder.CreateStore(Reduced, LHSPtr);
     }
   }
 
@@ -3538,10 +3538,6 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     Builder.CreateUnreachable();
   }
 
-  // Populate the outlined reduction function using the elementwise reduction
-  // function. Partial values are extracted from the type-erased array of
-  // pointers to private variables.
-  // populateReductionFunction(ReductionFunc, ReductionInfos, Builder, false);
   Builder.SetInsertPoint(ContinuationBlock);
   return Builder.saveIP();
 }
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index e79d0bb2f65aea..2d50964b700d3f 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -4835,8 +4835,12 @@ TEST_F(OpenMPIRBuilderTest, CreateReductions) {
   Builder.restoreIP(AfterIP);
 
   OpenMPIRBuilder::ReductionInfo ReductionInfos[] = {
-      {SumType, SumReduced, SumPrivatized, sumReduction, sumAtomicReduction},
-      {XorType, XorReduced, XorPrivatized, xorReduction, xorAtomicReduction}};
+      {SumType, SumReduced, SumPrivatized,
+       /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+       sumReduction, /*ReductionGenClang=*/nullptr, sumAtomicReduction},
+      {XorType, XorReduced, XorPrivatized,
+       /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+       xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}};
 
   OMPBuilder.createReductions(BodyIP, BodyAllocaIP, ReductionInfos);
 
@@ -5087,10 +5091,14 @@ TEST_F(OpenMPIRBuilderTest, CreateTwoReductions) {
 
   OMPBuilder.createReductions(
       FirstBodyIP, FirstBodyAllocaIP,
-      {{SumType, SumReduced, SumPrivatized, sumReduction, sumAtomicReduction}});
+      {{SumType, SumReduced, SumPrivatized,
+        /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+        sumReduction, /*ReductionGenClang=*/nullptr, sumAtomicReduction}});
   OMPBuilder.createReductions(
       SecondBodyIP, SecondBodyAllocaIP,
-      {{XorType, XorReduced, XorPrivatized, xorReduction, xorAtomicReduction}});
+      {{XorType, XorReduced, XorPrivatized,
+        /*EvaluationKind=*/OpenMPIRBuilder::EvaluationKindTy::Scalar,
+        xorReduction, /*ReductionGenClang=*/nullptr, xorAtomicReduction}});
 
   Builder.restoreIP(AfterIP);
   Builder.CreateRetVoid();

>From 9e64802be03bedd2fc26599182ce2ab60cdd26a8 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 15:48:55 +0000
Subject: [PATCH 09/10] Fix cpu reductions.

---
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   1 -
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     | 103 +++++++++++++-----
 2 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 2db4708a0ac60d..6978b621168a2e 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1572,7 +1572,6 @@ class OpenMPIRBuilder {
   /// \return The reduction function.
   Function *createReductionFunction(
       StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
-      bool IsGpu = false,
       ReductionGenCBTy ReductionGenCBTy = ReductionGenCBTy::MLIR,
       AttributeList FuncAttrs = {});
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 01d6a1e162455e..50ac126c5c91ba 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3102,13 +3102,13 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
 }
 
 Function *OpenMPIRBuilder::createReductionFunction(
-    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos, bool IsGPU,
+    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
     ReductionGenCBTy ReductionGenCBTy, AttributeList FuncAttrs) {
   auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                    {Builder.getPtrTy(), Builder.getPtrTy()},
                                    /* IsVarArg */ false);
   std::string Name = getReductionFuncName(ReducerName);
-  auto *ReductionFunc =
+  Function *ReductionFunc =
       Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
   ReductionFunc->setAttributes(FuncAttrs);
   ReductionFunc->addParamAttr(0, Attribute::NoUndef);
@@ -3117,33 +3117,27 @@ Function *OpenMPIRBuilder::createReductionFunction(
       BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
   Builder.SetInsertPoint(EntryBB);
 
+  // Need to alloca memory here and deal with the pointers before getting
+  // LHS/RHS pointers out
   Value *LHSArrayPtr = nullptr;
   Value *RHSArrayPtr = nullptr;
-  if (IsGPU) {
-    // Need to alloca memory here and deal with the pointers before getting
-    // LHS/RHS pointers out
-    //
-    Argument *Arg0 = ReductionFunc->getArg(0);
-    Argument *Arg1 = ReductionFunc->getArg(1);
-    Type *Arg0Type = Arg0->getType();
-    Type *Arg1Type = Arg1->getType();
-
-    Value *LHSAlloca =
-        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
-    Value *RHSAlloca =
-        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
-    Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-        LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
-    Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
-        RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
-    Builder.CreateStore(Arg0, LHSAddrCast);
-    Builder.CreateStore(Arg1, RHSAddrCast);
-    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
-    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
-  } else {
-    LHSArrayPtr = ReductionFunc->getArg(0);
-    RHSArrayPtr = ReductionFunc->getArg(1);
-  }
+  Argument *Arg0 = ReductionFunc->getArg(0);
+  Argument *Arg1 = ReductionFunc->getArg(1);
+  Type *Arg0Type = Arg0->getType();
+  Type *Arg1Type = Arg1->getType();
+
+  Value *LHSAlloca =
+      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
+  Value *RHSAlloca =
+      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
+  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
+  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
+      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
+  Builder.CreateStore(Arg0, LHSAddrCast);
+  Builder.CreateStore(Arg1, RHSAddrCast);
+  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
+  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
 
   Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
   Type *IndexTy = Builder.getIndexTy(
@@ -3417,6 +3411,54 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
   return Builder.saveIP();
 }
 
+static Function *getFreshReductionFunc(Module &M) {
+  Type *VoidTy = Type::getVoidTy(M.getContext());
+  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
+  auto *FuncTy =
+      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
+  return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
+                          ".omp.reduction.func", &M);
+}
+
+static void populateReductionFunction(
+    Function *ReductionFunc,
+    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
+    IRBuilder<> &Builder) {
+  Module *Module = ReductionFunc->getParent();
+  BasicBlock *ReductionFuncBlock =
+      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
+  Builder.SetInsertPoint(ReductionFuncBlock);
+  Value *LHSArrayPtr = nullptr;
+  Value *RHSArrayPtr = nullptr;
+  LHSArrayPtr = ReductionFunc->getArg(0);
+  RHSArrayPtr = ReductionFunc->getArg(1);
+
+  unsigned NumReductions = ReductionInfos.size();
+  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
+
+  for (auto En : enumerate(ReductionInfos)) {
+    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
+    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, LHSArrayPtr, 0, En.index());
+    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
+    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        LHSI8Ptr, RI.Variable->getType());
+    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
+    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
+        RedArrayTy, RHSArrayPtr, 0, En.index());
+    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
+    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        RHSI8Ptr, RI.PrivateVariable->getType());
+    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
+    Value *Reduced;
+    Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
+    if (!Builder.GetInsertBlock())
+      return;
+    Builder.CreateStore(Reduced, LHSPtr);
+  }
+  Builder.CreateRetVoid();
+}
+
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
     ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait,
@@ -3473,8 +3515,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   const DataLayout &DL = Module->getDataLayout();
   unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
   Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
-  Function *ReductionFunc = createReductionFunction(
-      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos);
+  Function *ReductionFunc = getFreshReductionFunc(M);
   Value *Lock = getOMPCriticalRegionLock(".reduction");
   Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
@@ -3538,6 +3579,10 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     Builder.CreateUnreachable();
   }
 
+  // Populate the outlined reduction function using the elementwise reduction
+  // function. Partial values are extracted from the type-erased array of
+  // pointers to private variables.
+  populateReductionFunction(ReductionFunc, ReductionInfos, Builder);
   Builder.SetInsertPoint(ContinuationBlock);
   return Builder.saveIP();
 }

>From 89c0f65295b64f308426156fc16895ab2d4a7a27 Mon Sep 17 00:00:00 2001
From: Akash Banerjee <Akash.Banerjee at amd.com>
Date: Wed, 14 Feb 2024 16:11:26 +0000
Subject: [PATCH 10/10] Fix nit issue,

---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 50ac126c5c91ba..4f14580b6686f8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -3256,7 +3256,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
   } else {
     CodeGenIP = Builder.saveIP();
     ReductionFunc = createReductionFunction(
-        Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, true,
+        Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos,
         ReductionGenCBTy, FuncAttrs);
     Builder.restoreIP(CodeGenIP);
   }