[llvm] 5ab6aed - [OpenMP] Folding threadLimit and numThreads when single value in kernels

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Tue Jul 27 18:47:21 PDT 2021


Author: Jose M Monsalve Diaz
Date: 2021-07-27T21:47:12-04:00
New Revision: 5ab6aedda9d959a44453b7163b59f645012dbb83

URL: https://github.com/llvm/llvm-project/commit/5ab6aedda9d959a44453b7163b59f645012dbb83
DIFF: https://github.com/llvm/llvm-project/commit/5ab6aedda9d959a44453b7163b59f645012dbb83.diff

LOG: [OpenMP] Folding threadLimit and numThreads when single value in kernels

The device runtime contains several calls to `__kmpc_get_hardware_num_threads_in_block`
and `__kmpc_get_hardware_num_blocks`. If the thread_limit and the num_teams are constant,
these calls can be folded to the constant value.

In this patch we use the already introduced `AAFoldRuntimeCall` and the `NumTeams` and
`NumThreads` kernel attributes (to be introduced in a different patch) to fold these functions.
The code checks all the kernels, and if their attributes match, the functions are folded.

In the future we will explore specializing for multiple values of NumThreads and NumTeams.

Depends on D106390

Reviewed By: jdoerfert, JonChesterfield

Differential Revision: https://reviews.llvm.org/D106033

Added: 
    llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll

Modified: 
    llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
    llvm/lib/Transforms/IPO/OpenMPOpt.cpp
    openmp/libomptarget/deviceRTLs/target_interface.h

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index 8e648a010610f..eb673b199fc41 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -206,6 +206,9 @@ __OMP_RTL(__kmpc_omp_reg_task_with_affinity, false, Int32, IdentPtr, Int32,
           /* kmp_task_t */ VoidPtr, Int32,
           /* kmp_task_affinity_info_t */ VoidPtr)
 
+__OMP_RTL(__kmpc_get_hardware_num_blocks, false, Int32, )
+__OMP_RTL(__kmpc_get_hardware_num_threads_in_block, false, Int32, )
+
 __OMP_RTL(omp_get_thread_num, false, Int32, )
 __OMP_RTL(omp_get_num_threads, false, Int32, )
 __OMP_RTL(omp_get_max_threads, false, Int32, )
@@ -601,6 +604,9 @@ __OMP_RTL_ATTRS(__kmpc_omp_reg_task_with_affinity, DefaultAttrs, AttributeSet(),
                 ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), ReadOnlyPtrAttrs,
                            AttributeSet(), ReadOnlyPtrAttrs))
 
+__OMP_RTL_ATTRS(__kmpc_get_hardware_num_blocks, GetterAttrs, AttributeSet(), ParamAttrs())
+__OMP_RTL_ATTRS(__kmpc_get_hardware_num_threads_in_block, GetterAttrs, AttributeSet(), ParamAttrs())
+
 __OMP_RTL_ATTRS(omp_get_thread_num, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_num_threads, GetterAttrs, AttributeSet(), ParamAttrs())
 __OMP_RTL_ATTRS(omp_get_max_threads, GetterAttrs, AttributeSet(), ParamAttrs())

diff  --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 9f98902105cc2..b803493527199 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1833,6 +1833,8 @@ struct OpenMPOpt {
     return Changed == ChangeStatus::CHANGED;
   }
 
+  void registerFoldRuntimeCall(RuntimeFunction RF);
+
   /// Populate the Attributor with abstract attribute opportunities in the
   /// function.
   void registerAAs(bool IsModulePass);
@@ -3506,6 +3508,8 @@ struct AAKernelInfoCallSite : AAKernelInfo {
     case OMPRTL___kmpc_is_spmd_exec_mode:
     case OMPRTL___kmpc_for_static_fini:
     case OMPRTL___kmpc_global_thread_num:
+    case OMPRTL___kmpc_get_hardware_num_threads_in_block:
+    case OMPRTL___kmpc_get_hardware_num_blocks:
     case OMPRTL___kmpc_single:
     case OMPRTL___kmpc_end_single:
     case OMPRTL___kmpc_master:
@@ -3710,7 +3714,6 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
 
   ChangeStatus updateImpl(Attributor &A) override {
     ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
     switch (RFKind) {
     case OMPRTL___kmpc_is_spmd_exec_mode:
       Changed |= foldIsSPMDExecMode(A);
@@ -3721,6 +3724,12 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
     case OMPRTL___kmpc_parallel_level:
       Changed |= foldParallelLevel(A);
       break;
+    case OMPRTL___kmpc_get_hardware_num_threads_in_block:
+      Changed = Changed | foldKernelFnAttribute(A, "omp_target_thread_limit");
+      break;
+    case OMPRTL___kmpc_get_hardware_num_blocks:
+      Changed = Changed | foldKernelFnAttribute(A, "omp_target_num_teams");
+      break;
     default:
       llvm_unreachable("Unhandled OpenMP runtime function!");
     }
@@ -3892,7 +3901,39 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
              "Expected only non-SPMD kernels!");
       SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
     }
+    return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
+                                                    : ChangeStatus::CHANGED;
+  }
+
+  ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
+    // Specialize only if all the calls agree with the attribute constant value
+    int32_t CurrentAttrValue = -1;
+    Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
+
+    auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
+        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
 
+    if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
+      return indicatePessimisticFixpoint();
+
+    // Iterate over the kernels that reach this function
+    for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
+      int32_t NextAttrVal = -1;
+      if (K->hasFnAttribute(Attr))
+        NextAttrVal =
+            std::stoi(K->getFnAttribute(Attr).getValueAsString().str());
+
+      if (NextAttrVal == -1 ||
+          (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
+        return indicatePessimisticFixpoint();
+      CurrentAttrValue = NextAttrVal;
+    }
+
+    if (CurrentAttrValue != -1) {
+      auto &Ctx = getAnchorValue().getContext();
+      SimplifiedValue =
+          ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
+    }
     return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
                                                     : ChangeStatus::CHANGED;
   }
@@ -3908,6 +3949,21 @@ struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
 
 } // namespace
 
+/// Register folding callsite
+void OpenMPOpt::registerFoldRuntimeCall(RuntimeFunction RF) {
+  auto &RFI = OMPInfoCache.RFIs[RF];
+  RFI.foreachUse(SCC, [&](Use &U, Function &F) {
+    CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
+    if (!CI)
+      return false;
+    A.getOrCreateAAFor<AAFoldRuntimeCall>(
+        IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
+        DepClassTy::NONE, /* ForceUpdate */ false,
+        /* UpdateAfterInit */ false);
+    return false;
+  });
+}
+
 void OpenMPOpt::registerAAs(bool IsModulePass) {
   if (SCC.empty())
 
@@ -3923,43 +3979,12 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
           DepClassTy::NONE, /* ForceUpdate */ false,
           /* UpdateAfterInit */ false);
 
-    auto &IsMainRFI =
-        OMPInfoCache.RFIs[OMPRTL___kmpc_is_generic_main_thread_id];
-    IsMainRFI.foreachUse(SCC, [&](Use &U, Function &F) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsMainRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-      return false;
-    });
 
-    auto &IsSPMDRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode];
-    IsSPMDRFI.foreachUse(SCC, [&](Use &U, Function &) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsSPMDRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-      return false;
-    });
-
-    auto &ParallelLevelRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_level];
-    ParallelLevelRFI.foreachUse(SCC, [&](Use &U, Function &) {
-      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &ParallelLevelRFI);
-      if (!CI)
-        return false;
-      A.getOrCreateAAFor<AAFoldRuntimeCall>(
-          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
-          DepClassTy::NONE, /* ForceUpdate */ false,
-          /* UpdateAfterInit */ false);
-
-      return false;
-    });
+    registerFoldRuntimeCall(OMPRTL___kmpc_is_generic_main_thread_id);
+    registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
+    registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
+    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
+    registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
   }
 
   // Create CallSite AA for all Getters.

diff  --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
new file mode 100644
index 0000000000000..628e75fd1b4fe
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+ at kernel0_exec_mode = weak constant i8 1
+
+ at G = external global i32
+;.
+; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32
+;.
+define weak void @kernel0() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel0()
+; CHECK: #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false)
+  ret void
+}
+
+ at kernel1_exec_mode = weak constant i8 1
+
+define weak void @kernel1() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel1()
+; CHECK: #[[ATTR0]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false)
+  call void @helper1()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+ at kernel2_exec_mode = weak constant i8 1
+
+define weak void @kernel2() #0 {
+; CHECK-LABEL: define {{[^@]+}}@kernel2()
+; CHECK: #[[ATTR0]] {
+; CHECK-NEXT:    [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+; CHECK-NEXT:    call void @helper0()
+; CHECK-NEXT:    call void @helper1()
+; CHECK-NEXT:    call void @helper2()
+; CHECK-NEXT:    call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false)
+  call void @helper0()
+  call void @helper1()
+  call void @helper2()
+  call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false)
+  ret void
+}
+
+define internal void @helper0() {
+; CHECK-LABEL: define {{[^@]+}}@helper0() {{#[0-9]+}} {
+; CHECK-NEXT:    store i32 666, i32* @G, align 4
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+define internal void @helper1() {
+; CHECK-LABEL: define {{[^@]+}}@helper1() {{#[0-9]+}} {
+; CHECK-NEXT:    br label [[F:%.*]]
+; CHECK:       t:
+; CHECK-NEXT:    unreachable
+; CHECK:       f:
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  %c = icmp eq i32 %threadLimit, 666
+  br i1 %c, label %f, label %t
+t:
+  call void @helper0()
+  ret void
+f:
+  ret void
+}
+
+define internal void @helper2() {
+; CHECK-LABEL: define {{[^@]+}}@helper2() {{#[0-9]+}} {
+; CHECK-NEXT:    store i32 666, i32* @G
+; CHECK-NEXT:    ret void
+;
+  %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block()
+  store i32 %threadLimit, i32* @G
+  ret void
+}
+
+declare i32 @__kmpc_get_hardware_num_threads_in_block()
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1
+declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1
+
+
+!llvm.module.flags = !{!0, !1}
+!nvvm.annotations = !{!2, !3, !4}
+
+attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"}
+
+!0 = !{i32 7, !"openmp", i32 50}
+!1 = !{i32 7, !"openmp-device", i32 50}
+!2 = !{void ()* @kernel0, !"kernel", i32 1}
+!3 = !{void ()* @kernel1, !"kernel", i32 1}
+!4 = !{void ()* @kernel2, !"kernel", i32 1}
+;.
+; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" }
+;
+; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META2:![0-9]+]] = !{void ()* @kernel0, !"kernel", i32 1}
+; CHECK: [[META3:![0-9]+]] = !{void ()* @kernel1, !"kernel", i32 1}
+; CHECK: [[META4:![0-9]+]] = !{void ()* @kernel2, !"kernel", i32 1}
+;.

diff  --git a/openmp/libomptarget/deviceRTLs/target_interface.h b/openmp/libomptarget/deviceRTLs/target_interface.h
index a4961c3b95bbc..c7ac0652e7557 100644
--- a/openmp/libomptarget/deviceRTLs/target_interface.h
+++ b/openmp/libomptarget/deviceRTLs/target_interface.h
@@ -18,8 +18,8 @@
 // Calls to the NVPTX layer (assuming 1D layout)
 EXTERN int __kmpc_get_hardware_thread_id_in_block();
 EXTERN int GetBlockIdInKernel();
-EXTERN int __kmpc_get_hardware_num_blocks();
-EXTERN int __kmpc_get_hardware_num_threads_in_block();
+EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
+EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
 EXTERN unsigned GetWarpId();
 EXTERN unsigned GetWarpSize();
 EXTERN unsigned GetLaneId();


        


More information about the llvm-commits mailing list