[clang] 514c033 - [OpenMP] Detect SPMD compatible kernels and execute them as such
Johannes Doerfert via cfe-commits
cfe-commits at lists.llvm.org
Sat Jul 10 16:56:26 PDT 2021
Author: Johannes Doerfert
Date: 2021-07-10T18:44:25-05:00
New Revision: 514c033db1e0c237eccd56b9fc11fe05a6baff39
URL: https://github.com/llvm/llvm-project/commit/514c033db1e0c237eccd56b9fc11fe05a6baff39
DIFF: https://github.com/llvm/llvm-project/commit/514c033db1e0c237eccd56b9fc11fe05a6baff39.diff
LOG: [OpenMP] Detect SPMD compatible kernels and execute them as such
In the spirit of TRegions [0], this patch analyzes a kernel and tracks
if it can be executed in SPMD-mode. If so, we flip the arguments of
the __kmpc_target_init and deinit call to enable the mode. We also
update the `<kernel>_exec_mode` flag to indicate to the runtime we
changed the mode to SPMD.
The code analysis is done interprocedurally by extending the
AAKernelInfo abstract attribute to track SPMD compatibility as well.
[0] https://link.springer.com/chapter/10.1007/978-3-030-28596-8_11
Differential Revision: https://reviews.llvm.org/D102307
Added:
llvm/test/Transforms/OpenMP/spmdization.ll
llvm/test/Transforms/OpenMP/spmdization_remarks.ll
Modified:
clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
llvm/lib/IR/Assumptions.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/custom_state_machines.ll
llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
Removed:
################################################################################
diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
index d50311f2d8692..20142d944f362 100644
--- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -4,6 +4,8 @@
// host-no-diagnostics
+void baz(void) __attribute__((assume("omp_no_openmp")));
+
void bar1(void) {
#pragma omp parallel // #0
// all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
@@ -27,6 +29,7 @@ void foo1(void) {
// all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}}
// all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: __omp_offloading}}
{
+ baz(); // all-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}}
#pragma omp parallel // #3
// all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}}
@@ -47,6 +50,7 @@ void foo2(void) {
// all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading}}
// all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__4_wrapper, kernel ID: __omp_offloading}}
{
+ baz(); // all-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}}
#pragma omp parallel // #6
// all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// all-remark@#6 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__4_wrapper, kernel ID: __omp_offloading}}
@@ -70,6 +74,7 @@ void foo3(void) {
// all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading}}
// all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__8_wrapper, kernel ID: __omp_offloading}}
{
+ baz(); // all-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}}
#pragma omp parallel // #9
// all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
// all-remark@#9 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading}}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
index cc656b78d8e83..b461000dade4d 100644
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -4,6 +4,8 @@
// host-no-diagnostics
+void baz(void) __attribute__((assume("omp_no_openmp")));
+
void bar(void) {
#pragma omp parallel // #1 \
// expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
@@ -18,6 +20,7 @@ void foo(void) {
// expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}} \
// expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: __omp_offloading}}
{
+ baz(); // expected-remark {{Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function '_Z3bazv'.}}
#pragma omp parallel // #3 \
// expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
// expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading}}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index a05aa231eb516..d174cc8992ddb 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -111,7 +111,10 @@ inline std::string getAllAssumeClauseOptions() {
/// Todo: Update kmp.h to include this file, and remove the enums in kmp.h
/// To complete this, more enum values will need to be moved here.
enum class OMPScheduleType {
+ StaticChunked = 33,
Static = 34, // static unspecialized
+ DistributeChunked = 91,
+ Distribute = 92,
DynamicChunked = 35,
GuidedChunked = 36, // guided unspecialized
Runtime = 37,
diff --git a/llvm/lib/IR/Assumptions.cpp b/llvm/lib/IR/Assumptions.cpp
index 1bd8b7f51e676..6498114cd60d5 100644
--- a/llvm/lib/IR/Assumptions.cpp
+++ b/llvm/lib/IR/Assumptions.cpp
@@ -33,4 +33,5 @@ StringSet<> llvm::KnownAssumptionStrings({
"omp_no_openmp", // OpenMP 5.1
"omp_no_openmp_routines", // OpenMP 5.1
"omp_no_parallelism", // OpenMP 5.1
+ "ompx_spmd_amenable", // OpenMPOpt extension
});
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index ceb2d8295c23a..b30c33c43f652 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -27,6 +27,7 @@
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
@@ -73,6 +74,9 @@ STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
"Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
"Number of OpenMP target region entry points (=kernels) identified");
+STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
+ "Number of OpenMP target region entry points (=kernels) executed in "
+ "SPMD-mode instead of generic-mode");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
"Number of OpenMP target region entry points (=kernels) executed in "
"generic-mode without a state machines");
@@ -481,6 +485,10 @@ struct KernelInfoState : AbstractState {
/// State to track what parallel region we might reach.
BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
+ /// State to track if we are in SPMD-mode, assumed or know, and why we decided
+ /// we cannot be.
+ BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker;
+
/// The __kmpc_target_init call in this kernel, if any. If we find more than
/// one we abort as the kernel is malformed.
CallBase *KernelInitCB = nullptr;
@@ -507,6 +515,7 @@ struct KernelInfoState : AbstractState {
/// See AbstractState::indicatePessimisticFixpoint(...)
ChangeStatus indicatePessimisticFixpoint() override {
IsAtFixpoint = true;
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
return ChangeStatus::CHANGED;
}
@@ -522,6 +531,8 @@ struct KernelInfoState : AbstractState {
const KernelInfoState &getAssumed() const { return *this; }
bool operator==(const KernelInfoState &RHS) const {
+ if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
+ return false;
if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
return false;
if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
@@ -552,6 +563,7 @@ struct KernelInfoState : AbstractState {
indicatePessimisticFixpoint();
KernelDeinitCB = KIS.KernelDeinitCB;
}
+ SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
return *this;
@@ -2669,8 +2681,10 @@ struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
const std::string getAsStr() const override {
if (!isValidState())
return "<invalid>";
- return
-
+ return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
+ : "generic") +
+ std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
+ : "") +
std::string(" #PRs: ") +
std::to_string(ReachedKnownParallelRegions.size()) +
", #Unknown PRs: " +
@@ -2745,8 +2759,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
assert((KernelInitCB && KernelDeinitCB) &&
"Kernel without __kmpc_target_init or __kmpc_target_deinit!");
- // For kernels we need to register a simplification callback so that the Attributor
- // knows the constant arguments to ___kmpc_target_init and
+ // For kernels we might need to initialize/finalize the IsSPMD state and
+ // we need to register a simplification callback so that the Attributor
+ // knows the constant arguments to __kmpc_target_init and
// __kmpc_target_deinit might actually change.
Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
@@ -2767,10 +2782,45 @@ struct AAKernelInfoFunction : AAKernelInfo {
return FalseVal;
};
+ Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB =
+ [&](const IRPosition &IRP, const AbstractAttribute *AA,
+ bool &UsedAssumedInformation) -> Optional<Value *> {
+ // IRP represents the "SPMDCompatibilityTracker" argument of an
+ // __kmpc_target_init or
+ // __kmpc_target_deinit call. We will answer this one with the internal
+ // state.
+ if (!isValidState())
+ return nullptr;
+ if (!SPMDCompatibilityTracker.isAtFixpoint()) {
+ if (AA)
+ A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
+ UsedAssumedInformation = true;
+ } else {
+ UsedAssumedInformation = false;
+ }
+ auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
+ SPMDCompatibilityTracker.isAssumed());
+ return Val;
+ };
+
+ constexpr const int InitIsSPMDArgNo = 1;
+ constexpr const int DeinitIsSPMDArgNo = 1;
constexpr const int InitUseStateMachineArgNo = 2;
A.registerSimplificationCallback(
IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
StateMachineSimplifyCB);
+ A.registerSimplificationCallback(
+ IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo),
+ IsSPMDModeSimplifyCB);
+ A.registerSimplificationCallback(
+ IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo),
+ IsSPMDModeSimplifyCB);
+
+ // Check if we know we are in SPMD-mode already.
+ ConstantInt *IsSPMDArg =
+ dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
+ if (IsSPMDArg && !IsSPMDArg->isZero())
+ SPMDCompatibilityTracker.indicateOptimisticFixpoint();
}
/// Modify the IR based on the KernelInfoState as the fixpoint iteration is
@@ -2781,11 +2831,81 @@ struct AAKernelInfoFunction : AAKernelInfo {
if (!KernelInitCB || !KernelDeinitCB)
return ChangeStatus::UNCHANGED;
- buildCustomStateMachine(A);
+ // Known SPMD-mode kernels need no manifest changes.
+ if (SPMDCompatibilityTracker.isKnown())
+ return ChangeStatus::UNCHANGED;
+
+ // If we can we change the execution mode to SPMD-mode otherwise we build a
+ // custom state machine.
+ if (!changeToSPMDMode(A))
+ buildCustomStateMachine(A);
return ChangeStatus::CHANGED;
}
+ bool changeToSPMDMode(Attributor &A) {
+ if (!SPMDCompatibilityTracker.isAssumed()) {
+ for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
+ if (!NonCompatibleI)
+ continue;
+ auto Remark = [&](OptimizationRemarkAnalysis ORA) {
+ ORA << "Kernel will be executed in generic-mode due to this "
+ "potential side-effect";
+ if (auto *CI = dyn_cast<CallBase>(NonCompatibleI)) {
+ if (Function *F = CI->getCalledFunction())
+ ORA << ", consider to add "
+ "`__attribute__((assume(\"ompx_spmd_amenable\")))`"
+ " to the called function '"
+ << F->getName() << "'";
+ }
+ return ORA << ".";
+ };
+ A.emitRemark<OptimizationRemarkAnalysis>(
+ NonCompatibleI, "OpenMPKernelNonSPMDMode", Remark);
+
+ LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
+ << *NonCompatibleI << "\n");
+ }
+
+ return false;
+ }
+
+ // Adjust the global exec mode flag that tells the runtime what mode this
+ // kernel is executed in.
+ Function *Kernel = getAnchorScope();
+ GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
+ (Kernel->getName() + "_exec_mode").str());
+ assert(ExecMode && "Kernel without exec mode?");
+ assert(ExecMode->getInitializer() &&
+ ExecMode->getInitializer()->isOneValue() &&
+ "Initially non-SPMD kernel has SPMD exec mode!");
+ ExecMode->setInitializer(
+ ConstantInt::get(ExecMode->getInitializer()->getType(), 0));
+
+ // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
+ const int InitIsSPMDArgNo = 1;
+ const int DeinitIsSPMDArgNo = 1;
+ const int InitUseStateMachineArgNo = 2;
+
+ auto &Ctx = getAnchorValue().getContext();
+ A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo),
+ *ConstantInt::getBool(Ctx, 1));
+ A.changeUseAfterManifest(
+ KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
+ *ConstantInt::getBool(Ctx, 0));
+ A.changeUseAfterManifest(
+ KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo),
+ *ConstantInt::getBool(Ctx, 1));
+ ++NumOpenMPTargetRegionKernelsSPMD;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Generic-mode kernel is changed to SPMD-mode.";
+ };
+ A.emitRemark<OptimizationRemark>(KernelInitCB, "OpenMPKernelSPMDMode",
+ Remark);
+ return true;
+ };
+
ChangeStatus buildCustomStateMachine(Attributor &A) {
assert(ReachedKnownParallelRegions.isValidState() &&
"Custom state machine with invalid parallel region states?");
@@ -2809,7 +2929,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
!IsSPMD->isZero())
return ChangeStatus::UNCHANGED;
- // First, indicate we use a custom state machine now.
+ // If not SPMD mode, indicate we use a custom state machine now.
auto &Ctx = getAnchorValue().getContext();
auto *FalseVal = ConstantInt::getBool(Ctx, 0);
A.changeUseAfterManifest(
@@ -3064,6 +3184,28 @@ struct AAKernelInfoFunction : AAKernelInfo {
ChangeStatus updateImpl(Attributor &A) override {
KernelInfoState StateBefore = getState();
+ // Callback to check a read/write instruction.
+ auto CheckRWInst = [&](Instruction &I) {
+ // We handle calls later.
+ if (isa<CallBase>(I))
+ return true;
+ // We only care about write effects.
+ if (!I.mayWriteToMemory())
+ return true;
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ SmallVector<const Value *> Objects;
+ getUnderlyingObjects(SI->getPointerOperand(), Objects);
+ if (llvm::all_of(Objects,
+ [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
+ return true;
+ }
+ // For now we give up on everything but stores.
+ SPMDCompatibilityTracker.insert(&I);
+ return true;
+ };
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ SPMDCompatibilityTracker.indicatePessimisticFixpoint();
+
// Callback to check a call instruction.
auto CheckCallInst = [&](Instruction &I) {
auto &CB = cast<CallBase>(I);
@@ -3101,6 +3243,10 @@ struct AAKernelInfoCallSite : AAKernelInfo {
return Fn && hasAssumption(*Fn, AssumptionStr);
};
+ // Check for SPMD-mode assumptions.
+ if (HasAssumption(Callee, "ompx_spmd_amenable"))
+ SPMDCompatibilityTracker.indicateOptimisticFixpoint();
+
// First weed out calls we do not care about, that is readonly/readnone
// calls, intrinsics, and "no_openmp" calls. Neither of these can reach a
// parallel region or anything else we are looking for.
@@ -3125,6 +3271,11 @@ struct AAKernelInfoCallSite : AAKernelInfo {
HasAssumption(Callee, "omp_no_parallelism")))
ReachedUnknownParallelRegions.insert(&CB);
+ // If SPMDCompatibilityTracker is not fixed, we need to give up on the
+ // idea we can run something unknown in SPMD-mode.
+ if (!SPMDCompatibilityTracker.isAtFixpoint())
+ SPMDCompatibilityTracker.insert(&CB);
+
// We have updated the state for this unknown call properly, there won't
// be any change so we indicate a fixpoint.
indicateOptimisticFixpoint();
@@ -3137,6 +3288,37 @@ struct AAKernelInfoCallSite : AAKernelInfo {
const unsigned int WrapperFunctionArgNo = 6;
RuntimeFunction RF = It->getSecond();
switch (RF) {
+ // All the functions we know are compatible with SPMD mode.
+ case OMPRTL___kmpc_is_spmd_exec_mode:
+ case OMPRTL___kmpc_for_static_fini:
+ case OMPRTL___kmpc_global_thread_num:
+ case OMPRTL___kmpc_single:
+ case OMPRTL___kmpc_end_single:
+ case OMPRTL___kmpc_master:
+ case OMPRTL___kmpc_end_master:
+ case OMPRTL___kmpc_barrier:
+ break;
+ case OMPRTL___kmpc_for_static_init_4:
+ case OMPRTL___kmpc_for_static_init_4u:
+ case OMPRTL___kmpc_for_static_init_8:
+ case OMPRTL___kmpc_for_static_init_8u: {
+ // Check the schedule and allow static schedule in SPMD mode.
+ unsigned ScheduleArgOpNo = 2;
+ auto *ScheduleTypeCI =
+ dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
+ unsigned ScheduleTypeVal =
+ ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
+ switch (OMPScheduleType(ScheduleTypeVal)) {
+ case OMPScheduleType::Static:
+ case OMPScheduleType::StaticChunked:
+ case OMPScheduleType::Distribute:
+ case OMPScheduleType::DistributeChunked:
+ break;
+ default:
+ SPMDCompatibilityTracker.insert(&CB);
+ break;
+ };
+ } break;
case OMPRTL___kmpc_target_init:
KernelInitCB = &CB;
break;
@@ -3156,9 +3338,13 @@ struct AAKernelInfoCallSite : AAKernelInfo {
break;
case OMPRTL___kmpc_omp_task:
// We do not look into tasks right now, just give up.
+ SPMDCompatibilityTracker.insert(&CB);
ReachedUnknownParallelRegions.insert(&CB);
break;
default:
+ // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
+ // generally.
+ SPMDCompatibilityTracker.insert(&CB);
break;
}
// All other OpenMP runtime calls will not reach parallel regions so they
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
index b352255a1b62f..f2e5e59ad5dcb 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll
@@ -1526,51 +1526,17 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_389eb_simple_state_machine_pure_l72
; CHECK-SAME: () #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8
; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true)
-; CHECK-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1
-; CHECK-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]]
-; CHECK: worker_state_machine.begin:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
-; CHECK-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]])
-; CHECK-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8
-; CHECK-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)*
-; CHECK-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null
-; CHECK-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]]
-; CHECK: worker_state_machine.finished:
-; CHECK-NEXT: ret void
-; CHECK: worker_state_machine.is_active.check:
-; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
-; CHECK: worker_state_machine.parallel_region.check:
-; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__13_wrapper
-; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
-; CHECK: worker_state_machine.parallel_region.execute:
-; CHECK-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]])
-; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
-; CHECK: worker_state_machine.parallel_region.check1:
-; CHECK-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]]
-; CHECK: worker_state_machine.parallel_region.execute2:
-; CHECK-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]])
-; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
-; CHECK: worker_state_machine.parallel_region.check3:
-; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]]
-; CHECK: worker_state_machine.parallel_region.end:
-; CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
-; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]]
-; CHECK: worker_state_machine.done.barrier:
-; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]])
-; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]]
-; CHECK: thread.user_code.check:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noalias noundef nonnull readnone align 8 dereferenceable(24) @[[GLOB1]], i1 noundef true, i1 noundef false, i1 noundef true)
; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
; CHECK: user_code.entry:
; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2]]
; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
; CHECK-NEXT: call void @__omp_outlined__12(i32* noundef nonnull align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noundef nonnull align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
-; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true)
+; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
; CHECK-NEXT: ret void
; CHECK: worker.exit:
; CHECK-NEXT: ret void
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
index d7383f853519c..167fb0da0af39 100644
--- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
+++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll
@@ -22,13 +22,15 @@ target triple = "nvptx64"
;; unknown();
;; }
;; }
-;;
+;;
+;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
;; void test_no_fallback(void) {
;; #pragma omp target teams
;; {
;; known();
;; known();
;; known();
+;; no_openmp(); // make it non-spmd
;; }
;; }
@@ -50,6 +52,7 @@ target triple = "nvptx64"
@__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1
@12 = private unnamed_addr constant [73 x i8] c";llvm/test/Transforms/OpenMP/custom_state_machines_remarks.c;known;4;1;;\00", align 1
@13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([73 x i8], [73 x i8]* @12, i32 0, i32 0) }, align 8
+ at G = external global i32
@llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata"
; Function Attrs: convergent norecurse nounwind
@@ -124,6 +127,8 @@ user_code.entry: ; preds = %entry
%6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43
call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45
+ call void @no_openmp()
+ call void @no_parallelism()
call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46
br label %common.ret
}
@@ -154,6 +159,9 @@ declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5
+declare void @no_openmp() #7
+declare void @no_parallelism() #8
+
attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
@@ -161,6 +169,8 @@ attributes #3 = { nounwind }
attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
attributes #5 = { argmemonly nofree nosync nounwind willreturn }
attributes #6 = { convergent nounwind }
+attributes #7 = { "llvm.assume"="omp_no_openmp" }
+attributes #8 = { "llvm.assume"="omp_no_parallelism" }
!llvm.dbg.cu = !{!0}
!omp_offload.info = !{!3, !4}
diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll
new file mode 100644
index 0000000000000..6ecda643acdbc
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/spmdization.ll
@@ -0,0 +1,214 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
+; RUN: opt -S -passes=openmp-opt < %s | FileCheck %s
+
+;; void unknown(void);
+;; void spmd_amenable(void) __attribute__((assume("ompx_spmd_amenable")))
+;;
+;; void sequential_loop() {
+;; #pragma omp target teams
+;; {
+;; for (int i = 0; i < 100; ++i) {
+;; #pragma omp parallel
+;; {
+;; unknown();
+;; }
+;; }
+; spmd_amenable();
+;; }
+;; }
+
+target triple = "nvptx64"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+ at 0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+ at __omp_offloading_2c_38c77_sequential_loop_l4_exec_mode = weak constant i8 1
+ at llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata"
+
+; The second argument of __kmpc_target_init and deinit is is set to true to indicate that we can run in SPMD mode.
+; We also adjusted the global __omp_offloading_2c_38c77_sequential_loop_l4_exec_mode to have a zero initializer (which indicates SPMD mode to the runtime).
+;.
+; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8
+; CHECK: @[[__OMP_OFFLOADING_2C_38C77_SEQUENTIAL_LOOP_L4_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 0
+; CHECK: @[[LLVM_COMPILER_USED:[a-zA-Z0-9_$"\\.-]+]] = appending global [1 x i8*] [i8* @__omp_offloading_2c_38c77_sequential_loop_l4_exec_mode], section "llvm.metadata"
+;.
+define weak void @__omp_offloading_2c_38c77_sequential_loop_l4() #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2c_38c77_sequential_loop_l4
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true)
+; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1
+; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
+; CHECK: user_code.entry:
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR2:[0-9]+]]
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+; CHECK-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true)
+; CHECK-NEXT: ret void
+; CHECK: worker.exit:
+; CHECK-NEXT: ret void
+;
+entry:
+ %.zero.addr = alloca i32, align 4
+ %.threadid_temp. = alloca i32, align 4
+ store i32 0, i32* %.zero.addr, align 4
+ %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+ %exec_user_code = icmp eq i32 %0, -1
+ br i1 %exec_user_code, label %user_code.entry, label %worker.exit
+
+user_code.entry: ; preds = %entry
+ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+ store i32 %1, i32* %.threadid_temp., align 4
+ call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #2
+ call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
+ ret void
+
+worker.exit: ; preds = %entry
+ ret void
+}
+
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
+
+; Function Attrs: convergent norecurse nounwind
+define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+; CHECK-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
+; CHECK-NEXT: br label [[FOR_COND:%.*]]
+; CHECK: for.cond:
+; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
+; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT: br label [[FOR_INC]]
+; CHECK: for.inc:
+; CHECK-NEXT: [[INC]] = add nsw i32 [[I_0]], 1
+; CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK: for.end:
+; CHECK-NEXT: call void @spmd_amenable()
+; CHECK-NEXT: ret void
+;
+entry:
+ %captured_vars_addrs = alloca [0 x i8*], align 8
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+ %cmp = icmp slt i32 %i.0, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %0 = load i32, i32* %.global_tid., align 4
+ %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
+ call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %1, i64 0)
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %inc = add nsw i32 %i.0, 1
+ br label %for.cond, !llvm.loop !6
+
+for.end: ; preds = %for.cond
+ call void @spmd_amenable()
+ ret void
+}
+
+; Function Attrs: convergent norecurse nounwind
+define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
+; CHECK-SAME: (i32* noalias nocapture nofree readnone [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree readnone [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @unknown() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+entry:
+ call void @unknown() #3
+ ret void
+}
+
+; Function Attrs: convergent
+declare void @unknown() #1
+
+; Function Attrs: convergent norecurse nounwind
+define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #0 {
+; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
+; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
+; CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+; CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
+; CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
+; CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR2]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %.addr1 = alloca i32, align 4
+ %.zero.addr = alloca i32, align 4
+ %global_args = alloca i8**, align 8
+ store i32 0, i32* %.zero.addr, align 4
+ store i32 %1, i32* %.addr1, align 4
+ call void @__kmpc_get_shared_variables(i8*** %global_args)
+ call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) #2
+ ret void
+}
+
+declare void @__kmpc_get_shared_variables(i8***)
+
+declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
+
+; Function Attrs: nounwind
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #2
+
+declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
+
+declare void @spmd_amenable() #4
+
+attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #2 = { nounwind }
+attributes #3 = { convergent }
+attributes #4 = { "llvm.assume"="ompx_spmd_amenable" }
+
+!omp_offload.info = !{!0}
+!nvvm.annotations = !{!1}
+!llvm.module.flags = !{!2, !3, !4, !8, !9}
+!llvm.ident = !{!5}
+
+!0 = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0}
+!1 = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 7, !"PIC Level", i32 2}
+!4 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!"clang version 13.0.0"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.mustprogress"}
+!8 = !{i32 7, !"openmp", i32 50}
+!9 = !{i32 7, !"openmp-device", i32 50}
+;.
+; CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+; CHECK: attributes #[[ATTR2]] = { nounwind }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" }
+; CHECK: attributes #[[ATTR4]] = { convergent }
+;.
+; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 44, i32 232567, !"sequential_loop", i32 4, i32 0}
+; CHECK: [[META1:![0-9]+]] = !{void ()* @__omp_offloading_2c_38c77_sequential_loop_l4, !"kernel", i32 1}
+; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; CHECK: [[META3:![0-9]+]] = !{i32 7, !"PIC Level", i32 2}
+; CHECK: [[META4:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
+; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; CHECK: [[META7:![0-9]+]] = !{!"clang version 13.0.0"}
+; CHECK: [[LOOP8]] = distinct !{!8, !9}
+; CHECK: [[META9:![0-9]+]] = !{!"llvm.loop.mustprogress"}
+;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
new file mode 100644
index 0000000000000..188b84d5c1187
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll
@@ -0,0 +1,233 @@
+; RUN: opt -passes=openmp-opt -pass-remarks=openmp-opt -pass-remarks-missed=openmp-opt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
+target triple = "nvptx64"
+
+; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function 'unknown'.
+; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: Kernel will be executed in generic-mode due to this potential side-effect, consider to add `__attribute__((assume("ompx_spmd_amenable")))` to the called function 'unknown'.
+; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:11:1: Generic-mode kernel is executed with a customized state machine that requires a fallback [1 known parallel regions, 2 unkown parallel regions] (bad).
+; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:13:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism").
+; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:15:5: State machine fallback caused by this call. If it is a false positive, use `__attribute__((assume("omp_no_openmp")))` (or "omp_no_parallelism").
+; CHECK: remark: llvm/test/Transforms/OpenMP/spmdization_remarks.c:20:1: Generic-mode kernel is changed to SPMD-mode.
+
+;; void unknown(void);
+;; void known(void) {
+;; #pragma omp parallel
+;; {
+;; unknown();
+;; }
+;; }
+;;
+;; void test_fallback(void) {
+;; #pragma omp target teams
+;; {
+;; unknown();
+;; known();
+;; unknown();
+;; }
+;; }
+;;
+;; void no_openmp(void) __attribute__((assume("omp_no_openmp")));
+;; void test_no_fallback(void) {
+;; #pragma omp target teams
+;; {
+;; known();
+;; known();
+;; known();
+;; spmd_amenable();
+;; }
+;; }
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+
+ at 0 = private unnamed_addr constant [103 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;1;;\00", align 1
+ at 1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([103 x i8], [103 x i8]* @0, i32 0, i32 0) }, align 8
+ at 2 = private unnamed_addr constant [72 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;test_fallback;11;1;;\00", align 1
+ at 3 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([72 x i8], [72 x i8]* @2, i32 0, i32 0) }, align 8
+ at 4 = private unnamed_addr constant [104 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_fallback_l11;11;25;;\00", align 1
+ at 5 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([104 x i8], [104 x i8]* @4, i32 0, i32 0) }, align 8
+ at __omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode = weak constant i8 1
+ at 6 = private unnamed_addr constant [106 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;1;;\00", align 1
+ at 7 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([106 x i8], [106 x i8]* @6, i32 0, i32 0) }, align 8
+ at 8 = private unnamed_addr constant [75 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;test_no_fallback;20;1;;\00", align 1
+ at 9 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([75 x i8], [75 x i8]* @8, i32 0, i32 0) }, align 8
+ at 10 = private unnamed_addr constant [107 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;__omp_offloading_2a_d80d3d_test_no_fallback_l20;20;25;;\00", align 1
+ at 11 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([107 x i8], [107 x i8]* @10, i32 0, i32 0) }, align 8
+ at __omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode = weak constant i8 1
+ at 12 = private unnamed_addr constant [63 x i8] c";llvm/test/Transforms/OpenMP/spmdization_remarks.c;known;4;1;;\00", align 1
+ at 13 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([63 x i8], [63 x i8]* @12, i32 0, i32 0) }, align 8
+ at G = external global i32
+ at llvm.compiler.used = appending global [2 x i8*] [i8* @__omp_offloading_2a_d80d3d_test_fallback_l11_exec_mode, i8* @__omp_offloading_2a_d80d3d_test_no_fallback_l20_exec_mode], section "llvm.metadata"
+
+; Function Attrs: convergent norecurse nounwind
+define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 {
+entry:
+ %captured_vars_addrs.i.i = alloca [0 x i8*], align 8
+ %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3, !dbg !18
+ %exec_user_code = icmp eq i32 %0, -1, !dbg !18
+ br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18
+
+common.ret: ; preds = %entry, %user_code.entry
+ ret void, !dbg !19
+
+user_code.entry: ; preds = %entry
+ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @3) #3
+ call void @unknown() #6, !dbg !20
+ %2 = bitcast [0 x i8*]* %captured_vars_addrs.i.i to i8*
+ call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
+ %4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i.i, i64 0, i64 0, !dbg !23
+ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23
+ call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26
+ call void @unknown() #6, !dbg !27
+ call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i1 false, i1 true) #3, !dbg !28
+ br label %common.ret
+}
+
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr
+
+; Function Attrs: convergent
+declare void @unknown() local_unnamed_addr #1
+
+; Function Attrs: nounwind
+define hidden void @known() local_unnamed_addr #2 !dbg !29 {
+entry:
+ %captured_vars_addrs = alloca [0 x i8*], align 8
+ %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @13)
+ %1 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs, i64 0, i64 0, !dbg !30
+ call void @__kmpc_parallel_51(%struct.ident_t* nonnull @13, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** nonnull %1, i64 0) #3, !dbg !30
+ ret void, !dbg !31
+}
+
+; Function Attrs: nounwind
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3
+
+declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 {
+entry:
+ %captured_vars_addrs.i2.i = alloca [0 x i8*], align 8
+ %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i1 false, i1 true, i1 true) #3, !dbg !33
+ %exec_user_code = icmp eq i32 %0, -1, !dbg !33
+ br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33
+
+common.ret: ; preds = %entry, %user_code.entry
+ ret void, !dbg !34
+
+user_code.entry: ; preds = %entry
+ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @9) #3
+ %2 = bitcast [0 x i8*]* %captured_vars_addrs.i2.i to i8*
+ call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
+ %3 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
+ %4 = getelementptr inbounds [0 x i8*], [0 x i8*]* %captured_vars_addrs.i2.i, i64 0, i64 0, !dbg !35
+ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !35
+ call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !39
+ call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
+ %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
+ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %5, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !40
+ call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !42
+ call void @llvm.lifetime.start.p0i8(i64 0, i8* nonnull %2) #3
+ %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* noundef nonnull @13) #3
+ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %6, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !43
+ call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45
+ call void @spmd_amenable()
+ call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46
+ br label %common.ret
+}
+
+; Function Attrs: convergent norecurse nounwind
+define internal void @__omp_outlined__2(i32* noalias nocapture nofree readnone %.global_tid., i32* noalias nocapture nofree readnone %.bound_tid.) #0 !dbg !47 {
+entry:
+ call void @unknown() #6, !dbg !48
+ ret void, !dbg !49
+}
+
+; Function Attrs: convergent norecurse nounwind
+define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) #0 !dbg !50 {
+entry:
+ %global_args = alloca i8**, align 8
+ call void @__kmpc_get_shared_variables(i8*** nonnull %global_args) #3, !dbg !51
+ call void @unknown() #6, !dbg !52
+ ret void, !dbg !51
+}
+
+declare void @__kmpc_get_shared_variables(i8***) local_unnamed_addr
+
+declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) local_unnamed_addr
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #5
+
+; Function Attrs: argmemonly nofree nosync nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #5
+
+declare void @spmd_amenable() #7
+
+attributes #0 = { convergent norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #1 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #2 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #3 = { nounwind }
+attributes #4 = { norecurse nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_53" "target-features"="+ptx32,+sm_53" }
+attributes #5 = { argmemonly nofree nosync nounwind willreturn }
+attributes #6 = { convergent nounwind }
+attributes #7 = { "llvm.assume"="ompx_spmd_amenable" }
+
+!llvm.dbg.cu = !{!0}
+!omp_offload.info = !{!3, !4}
+!nvvm.annotations = !{!5, !6}
+!llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13}
+!llvm.ident = !{!14}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "spmdization_remarks.c", directory: "/data/src/llvm-project")
+!2 = !{}
+!3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1}
+!4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0}
+!5 = !{void ()* @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1}
+!6 = !{void ()* @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1}
+!7 = !{i32 7, !"Dwarf Version", i32 2}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"wchar_size", i32 4}
+!10 = !{i32 7, !"openmp", i32 50}
+!11 = !{i32 7, !"openmp-device", i32 50}
+!12 = !{i32 7, !"PIC Level", i32 2}
+!13 = !{i32 7, !"frame-pointer", i32 2}
+!14 = !{!"clang version 13.0.0"}
+!15 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_fallback_l11", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!16 = !DIFile(filename: "llvm/test/Transforms/OpenMP/spmdization_remarks.c", directory: "/data/src/llvm-project")
+!17 = !DISubroutineType(types: !2)
+!18 = !DILocation(line: 11, column: 1, scope: !15)
+!19 = !DILocation(line: 0, scope: !15)
+!20 = !DILocation(line: 13, column: 5, scope: !21, inlinedAt: !22)
+!21 = distinct !DISubprogram(name: "__omp_outlined__", scope: !16, file: !16, line: 11, type: !17, scopeLine: 11, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!22 = distinct !DILocation(line: 11, column: 1, scope: !15)
+!23 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !25)
+!24 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!25 = distinct !DILocation(line: 14, column: 5, scope: !21, inlinedAt: !22)
+!26 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !25)
+!27 = !DILocation(line: 15, column: 5, scope: !21, inlinedAt: !22)
+!28 = !DILocation(line: 11, column: 25, scope: !15)
+!29 = distinct !DISubprogram(name: "known", scope: !16, file: !16, line: 3, type: !17, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!30 = !DILocation(line: 4, column: 1, scope: !29)
+!31 = !DILocation(line: 8, column: 1, scope: !29)
+!32 = distinct !DISubprogram(name: "__omp_offloading_2a_d80d3d_test_no_fallback_l20", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!33 = !DILocation(line: 20, column: 1, scope: !32)
+!34 = !DILocation(line: 0, scope: !32)
+!35 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !36)
+!36 = distinct !DILocation(line: 22, column: 5, scope: !37, inlinedAt: !38)
+!37 = distinct !DISubprogram(name: "__omp_outlined__1", scope: !16, file: !16, line: 20, type: !17, scopeLine: 20, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!38 = distinct !DILocation(line: 20, column: 1, scope: !32)
+!39 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !36)
+!40 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !41)
+!41 = distinct !DILocation(line: 23, column: 5, scope: !37, inlinedAt: !38)
+!42 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !41)
+!43 = !DILocation(line: 4, column: 1, scope: !24, inlinedAt: !44)
+!44 = distinct !DILocation(line: 24, column: 5, scope: !37, inlinedAt: !38)
+!45 = !DILocation(line: 8, column: 1, scope: !24, inlinedAt: !44)
+!46 = !DILocation(line: 20, column: 25, scope: !32)
+!47 = distinct !DISubprogram(name: "__omp_outlined__2", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!48 = !DILocation(line: 6, column: 5, scope: !47)
+!49 = !DILocation(line: 7, column: 3, scope: !47)
+!50 = distinct !DISubprogram(linkageName: "__omp_outlined__2_wrapper", scope: !16, file: !16, line: 4, type: !17, scopeLine: 4, flags: DIFlagArtificial, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!51 = !DILocation(line: 4, column: 1, scope: !50)
+!52 = !DILocation(line: 6, column: 5, scope: !47, inlinedAt: !53)
+!53 = distinct !DILocation(line: 4, column: 1, scope: !50)
More information about the cfe-commits
mailing list