[clang] [OpenMP] Make team reductions less bad (PR #70981)

via cfe-commits cfe-commits at lists.llvm.org
Wed Nov 1 14:02:02 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang-codegen

Author: Johannes Doerfert (jdoerfert)

<details>
<summary>Changes</summary>

This removes an unnecessary early synchronization in the copy function, removes the unused thread id argument, and uses 64-bit operations for the copy through shared memory if the reduction element is at least 64 bits wide. None of these changes should impact anything but performance.

---

Patch is 190.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70981.diff


7 Files Affected:

- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+33-24) 
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (+3-3) 
- (modified) clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp (+231-297) 
- (modified) clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp (+58-76) 
- (modified) clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp (+321-396) 
- (modified) clang/test/OpenMP/reduction_implicit_map.cpp (+33-49) 
- (modified) clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (+56-60) 


``````````diff
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 0ed665e0dfb9722..76aa1688616a81d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -18,9 +18,12 @@
 #include "clang/AST/OpenMPClause.h"
 #include "clang/AST/StmtOpenMP.h"
 #include "clang/AST/StmtVisitor.h"
+#include "clang/AST/Type.h"
 #include "clang/Basic/Cuda.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/Support/MathExtras.h"
 
 using namespace clang;
@@ -1342,7 +1345,7 @@ void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
   // Build call __kmpc_cancel_barrier(loc, thread_id);
   unsigned Flags = getDefaultFlagsForBarriers(Kind);
   llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
-                         getThreadID(CGF, Loc)};
+                         llvm::ConstantInt::getNullValue(CGF.Int32Ty)};
 
   CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                           CGM.getModule(), OMPRTL___kmpc_barrier),
@@ -1798,7 +1801,6 @@ static void emitReductionListCopy(
 /// void inter_warp_copy_func(void* reduce_data, num_warps)
 ///   shared smem[warp_size];
 ///   For all data entries D in reduce_data:
-///     sync
 ///     If (I am the first lane in each warp)
 ///       Copy my local D to smem[warp_id]
 ///     sync
@@ -1844,21 +1846,27 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
   // for reduced latency, as well as to have a distinct copy for concurrently
   // executing target regions.  The array is declared with common linkage so
   // as to be shared across compilation units.
-  StringRef TransferMediumName =
-      "__openmp_nvptx_data_transfer_temporary_storage";
-  llvm::GlobalVariable *TransferMedium =
-      M.getGlobalVariable(TransferMediumName);
-  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
-  if (!TransferMedium) {
-    auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
-    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
-    TransferMedium = new llvm::GlobalVariable(
-        M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
-        llvm::UndefValue::get(Ty), TransferMediumName,
-        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
-        SharedAddressSpace);
-    CGM.addCompilerUsedGlobal(TransferMedium);
-  }
+  auto CreateSharedMem = [&](llvm::IntegerType *ElementTy, StringRef Suffix) {
+    std::string TransferMediumName =
+        "__openmp_nvptx_data_transfer_temporary_storage";
+    TransferMediumName += Suffix;
+    llvm::GlobalVariable *TransferMedium =
+        M.getGlobalVariable(TransferMediumName);
+    unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
+    if (!TransferMedium) {
+      auto *Ty = llvm::ArrayType::get(ElementTy, WarpSize);
+      unsigned SharedAddressSpace =
+          C.getTargetAddressSpace(LangAS::cuda_shared);
+      TransferMedium = new llvm::GlobalVariable(
+          M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
+          llvm::UndefValue::get(Ty), TransferMediumName,
+          /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
+          SharedAddressSpace);
+    }
+    return TransferMedium;
+  };
+  llvm::GlobalVariable *TransferMedium4 = CreateSharedMem(CGM.Int32Ty, "_4");
+  llvm::GlobalVariable *TransferMedium8 = CreateSharedMem(CGM.Int64Ty, "_8");
 
   auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   // Get the CUDA thread id of the current OpenMP thread on the GPU.
@@ -1888,7 +1896,11 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
         C.getTypeSizeInChars(Private->getType())
             .alignTo(C.getTypeAlignInChars(Private->getType()))
             .getQuantity();
-    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
+
+    llvm::GlobalVariable *TransferMedium =
+        RealTySize >= 8 ? TransferMedium8 : TransferMedium4;
+    for (unsigned TySize = RealTySize >= 8 ? 8 : 4;
+         TySize > 0 && RealTySize > 0; TySize /= 2) {
       unsigned NumIters = RealTySize / TySize;
       if (NumIters == 0)
         continue;
@@ -1916,10 +1928,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
         Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
         CGF.EmitBlock(BodyBB);
       }
-      // kmpc_barrier.
-      CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
-                                             /*EmitChecks=*/false,
-                                             /*ForceSimpleCall=*/true);
+
       llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
       llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
       llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
@@ -1958,7 +1967,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
           ElemPtr, /*Volatile=*/false, CType, Loc,
           LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
       // Store the source element value to the dest element address.
-      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
+      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false, CType,
                             LValueBaseInfo(AlignmentSource::Type),
                             TBAAAccessInfo());
 
@@ -2014,7 +2023,7 @@ static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
 
       // *TargetElemPtr = SrcMediumVal;
       llvm::Value *SrcMediumValue =
-          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
+          CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/false, CType, Loc);
       CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                             CType);
       Bld.CreateBr(W0MergeBB);
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index 141436f26230dde..09e20cf50a95e63 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -44,6 +44,9 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
     DS_Generic,
   };
 
+  /// Get barrier to synchronize all threads in a block.
+  void syncCTAThreads(CodeGenFunction &CGF);
+
 private:
   /// Parallel outlined function work for workers to execute.
   llvm::SmallVector<llvm::Function *, 16> Work;
@@ -56,9 +59,6 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
 
   DataSharingMode getDataSharingMode() const;
 
-  /// Get barrier to synchronize all threads in a block.
-  void syncCTAThreads(CodeGenFunction &CGF);
-
   /// Helper for target directive initialization.
   void emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF,
                       EntryFunctionState &EST, bool IsSPMD);
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index 094c5ae3522f96d..968bd49647e0751 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -225,56 +225,40 @@ int bar(int n){
 // CHECK-64-NEXT:  entry:
 // CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[DOTCNT_ADDR:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
-// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
-// CHECK-64-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK-64-NEXT:    store i32 0, ptr [[DOTCNT_ADDR]], align 4
-// CHECK-64-NEXT:    br label [[PRECOND:%.*]]
-// CHECK-64:       precond:
-// CHECK-64-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCNT_ADDR]], align 4
-// CHECK-64-NEXT:    [[TMP8:%.*]] = icmp ult i32 [[TMP7]], 2
-// CHECK-64-NEXT:    br i1 [[TMP8]], label [[BODY:%.*]], label [[EXIT:%.*]]
-// CHECK-64:       body:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK-64:       then:
-// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[TMP9]], align 8
-// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 [[TMP7]]
-// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK-64-NEXT:    [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4
-// CHECK-64-NEXT:    store volatile i32 [[TMP13]], ptr addrspace(3) [[TMP12]], align 4
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_8, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i64, ptr [[TMP7]], align 8
+// CHECK-64-NEXT:    store i64 [[TMP9]], ptr addrspace(3) [[TMP8]], align 8
 // CHECK-64-NEXT:    br label [[IFCONT:%.*]]
 // CHECK-64:       else:
 // CHECK-64-NEXT:    br label [[IFCONT]]
 // CHECK-64:       ifcont:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-64-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP14]]
+// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2:[0-9]+]], i32 0)
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP10]]
 // CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
 // CHECK-64:       then2:
-// CHECK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
-// CHECK-64-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 [[TMP7]]
-// CHECK-64-NEXT:    [[TMP19:%.*]] = load volatile i32, ptr addrspace(3) [[TMP15]], align 4
-// CHECK-64-NEXT:    store i32 [[TMP19]], ptr [[TMP18]], align 4
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i64], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_8, i64 0, i32 [[TMP2]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i64, ptr addrspace(3) [[TMP11]], align 8
+// CHECK-64-NEXT:    store i64 [[TMP14]], ptr [[TMP13]], align 8
 // CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
 // CHECK-64:       else3:
 // CHECK-64-NEXT:    br label [[IFCONT4]]
 // CHECK-64:       ifcont4:
-// CHECK-64-NEXT:    [[TMP20:%.*]] = add nsw i32 [[TMP7]], 1
-// CHECK-64-NEXT:    store i32 [[TMP20]], ptr [[DOTCNT_ADDR]], align 4
-// CHECK-64-NEXT:    br label [[PRECOND]]
-// CHECK-64:       exit:
 // CHECK-64-NEXT:    ret void
 //
 //
@@ -451,65 +435,62 @@ int bar(int n){
 // CHECK-64-NEXT:  entry:
 // CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP4]], 31
-// CHECK-64-NEXT:    [[TMP5:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP5]], 5
-// CHECK-64-NEXT:    [[TMP6:%.*]] = load ptr, ptr [[DOTADDR]], align 8
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-64-NEXT:    [[NVPTX_WARP_ID:%.*]] = ashr i32 [[TMP4]], 5
+// CHECK-64-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK-64-NEXT:    br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]]
 // CHECK-64:       then:
-// CHECK-64-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP8:%.*]] = load ptr, ptr [[TMP7]], align 8
-// CHECK-64-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK-64-NEXT:    [[TMP10:%.*]] = load i8, ptr [[TMP8]], align 1
-// CHECK-64-NEXT:    store volatile i8 [[TMP10]], ptr addrspace(3) [[TMP9]], align 1
+// CHECK-64-NEXT:    [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-64-NEXT:    [[TMP8:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
+// CHECK-64-NEXT:    store i8 [[TMP9]], ptr addrspace(3) [[TMP8]], align 1
 // CHECK-64-NEXT:    br label [[IFCONT:%.*]]
 // CHECK-64:       else:
 // CHECK-64-NEXT:    br label [[IFCONT]]
 // CHECK-64:       ifcont:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-64-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP3]], [[TMP11]]
+// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 0)
+// CHECK-64-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[TMP2]], [[TMP10]]
 // CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD]], label [[THEN2:%.*]], label [[ELSE3:%.*]]
 // CHECK-64:       then2:
-// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK-64-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 0
-// CHECK-64-NEXT:    [[TMP14:%.*]] = load ptr, ptr [[TMP13]], align 8
-// CHECK-64-NEXT:    [[TMP15:%.*]] = load volatile i8, ptr addrspace(3) [[TMP12]], align 1
-// CHECK-64-NEXT:    store i8 [[TMP15]], ptr [[TMP14]], align 1
+// CHECK-64-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[TMP2]]
+// CHECK-64-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 0
+// CHECK-64-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[TMP12]], align 8
+// CHECK-64-NEXT:    [[TMP14:%.*]] = load i8, ptr addrspace(3) [[TMP11]], align 1
+// CHECK-64-NEXT:    store i8 [[TMP14]], ptr [[TMP13]], align 1
 // CHECK-64-NEXT:    br label [[IFCONT4:%.*]]
 // CHECK-64:       else3:
 // CHECK-64-NEXT:    br label [[IFCONT4]]
 // CHECK-64:       ifcont4:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
 // CHECK-64-NEXT:    [[WARP_MASTER5:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0
 // CHECK-64-NEXT:    br i1 [[WARP_MASTER5]], label [[THEN6:%.*]], label [[ELSE7:%.*]]
 // CHECK-64:       then6:
-// CHECK-64-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
-// CHECK-64-NEXT:    [[TMP17:%.*]] = load ptr, ptr [[TMP16]], align 8
-// CHECK-64-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]]
-// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[TMP17]], align 4
-// CHECK-64-NEXT:    store volatile i32 [[TMP19]], ptr addrspace(3) [[TMP18]], align 4
+// CHECK-64-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8
+// CHECK-64-NEXT:    [[TMP17:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[NVPTX_WARP_ID]]
+// CHECK-64-NEXT:    [[TMP18:%.*]] = load i32, ptr [[TMP16]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP18]], ptr addrspace(3) [[TMP17]], align 4
 // CHECK-64-NEXT:    br label [[IFCONT8:%.*]]
 // CHECK-64:       else7:
 // CHECK-64-NEXT:    br label [[IFCONT8]]
 // CHECK-64:       ifcont8:
-// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-64-NEXT:    [[TMP20:%.*]] = load i32, ptr [[DOTADDR1]], align 4
-// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP3]], [[TMP20]]
+// CHECK-64-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 0)
+// CHECK-64-NEXT:    [[TMP19:%.*]] = load i32, ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[IS_ACTIVE_THREAD9:%.*]] = icmp ult i32 [[TMP2]], [[TMP19]]
 // CHECK-64-NEXT:    br i1 [[IS_ACTIVE_THREAD9]], label [[THEN10:%.*]], label [[ELSE11:%.*]]
 // CHECK-64:       then10:
-// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[TMP3]]
-// CHECK-64-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP6]], i64 0, i64 1
-// CHECK-64-NEXT:    [[TMP23:%.*]] = load ptr, ptr [[TMP22]], align 8
-// CHECK-64-NEXT:    [[TMP24:%.*]] = load volatile i32, ptr addrspace(3) [[TMP21]], align 4
-// CHECK-64-NEXT:    store i32 [[TMP24]], ptr [[TMP23]], align 4
+// CHECK-64-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [32 x i32], ptr addrspace(3) @__openmp_nvptx_data_transfer_temporary_storage_4, i64 0, i32 [[TMP2]]
+// CHECK-64-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP5]], i64 0, i64 1
+// CHECK-64-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[TMP21]], align 8
+// CHECK-64-NEXT:    [[TMP23:%.*]] = load i32, ptr addrspace(3) [[TMP20]], align 4
+// CHECK-64-NEXT:    store i32 [[TMP23]], ptr [[TMP22]], align 4
 // CHECK-64-NEXT:    br label [[IFCONT12:%.*]]
 // CHECK-64:       else11:
 // CHECK-64-NEXT:    br label [[IFCONT12]]
@@ -707,65 +688,62 @@ int bar(int n){
 // CHECK-64-NEXT:  entry:
 // CHECK-64-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
 // CHECK-64-NEXT:    [[DOTADDR1:%.*]] = alloca i32, align 4
-// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]])
 // CHECK-64-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
 // CHECK-64-NEXT:    store i32 [[TMP1]], ptr [[DOTADDR1]], align 4
+// CHECK-64-NEXT:    [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
 // CHECK-64-NEXT:    [[TMP3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and i32 [[TMP3]], 31
 // CHECK-64-NEXT:    [[TMP4:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK-64-NEXT:    [[NVPTX_LANE_ID:%.*]] = and...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/70981


More information about the cfe-commits mailing list