[clang] 0ae9dac - [OpenACC][CIR] Lower 'num_workers' for parallel/kernels (#136578)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Apr 21 11:51:30 PDT 2025
Author: Erich Keane
Date: 2025-04-21T11:51:26-07:00
New Revision: 0ae9dac262afccea1e1a2e02520f737ab38b286c
URL: https://github.com/llvm/llvm-project/commit/0ae9dac262afccea1e1a2e02520f737ab38b286c
DIFF: https://github.com/llvm/llvm-project/commit/0ae9dac262afccea1e1a2e02520f737ab38b286c.diff
LOG: [OpenACC][CIR] Lower 'num_workers' for parallel/kernels (#136578)
This patch also includes the first one to handle 'device_type' properly,
which is where most of the 'challenge' here comes from.
>From the best I can tell: we must keep two lists of the same size, 1 of
all of the 'num_workers' items, and 1 of the 'device_type' value for
that 'num_workers'. Additionally, the 'device_type' list can only handle
single 'device_type' values, so we have to duplicate the 'num_workers'
items in cases where there are multiple applicable 'device_type' values.
This patch accomplishes this by keeping the two in sync, and saving the
current 'device_type' in the visitor.
Added:
Modified:
clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
clang/test/CIR/CodeGenOpenACC/kernels.c
clang/test/CIR/CodeGenOpenACC/parallel.c
Removed:
################################################################################
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index 30e840cbfa1f7..604fdf369860e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -46,10 +46,27 @@ class OpenACCClauseCIREmitter final
// diagnostics are gone.
SourceLocation dirLoc;
+ const OpenACCDeviceTypeClause *lastDeviceTypeClause = nullptr;
+
void clauseNotImplemented(const OpenACCClause &c) {
cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind());
}
+ mlir::Value createIntExpr(const Expr *intExpr) {
+ mlir::Value expr = cgf.emitScalarExpr(intExpr);
+ mlir::Location exprLoc = cgf.cgm.getLoc(intExpr->getBeginLoc());
+
+ mlir::IntegerType targetType = mlir::IntegerType::get(
+ &cgf.getMLIRContext(), cgf.getContext().getIntWidth(intExpr->getType()),
+ intExpr->getType()->isSignedIntegerOrEnumerationType()
+ ? mlir::IntegerType::SignednessSemantics::Signed
+ : mlir::IntegerType::SignednessSemantics::Unsigned);
+
+ auto conversionOp = builder.create<mlir::UnrealizedConversionCastOp>(
+ exprLoc, targetType, expr);
+ return conversionOp.getResult(0);
+ }
+
// 'condition' as an OpenACC grammar production is used for 'if' and (some
// variants of) 'self'. It needs to be emitted as a signless-1-bit value, so
// this function emits the expression, then sets the unrealized conversion
@@ -109,6 +126,7 @@ class OpenACCClauseCIREmitter final
}
void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) {
+ lastDeviceTypeClause = &clause;
if constexpr (isOneOfTypes<OpTy, InitOp, ShutdownOp>) {
llvm::SmallVector<mlir::Attribute> deviceTypes;
std::optional<mlir::ArrayAttr> existingDeviceTypes =
@@ -116,7 +134,7 @@ class OpenACCClauseCIREmitter final
// Ensure we keep the existing ones, and in the correct 'new' order.
if (existingDeviceTypes) {
- for (const mlir::Attribute &Attr : *existingDeviceTypes)
+ for (mlir::Attribute Attr : *existingDeviceTypes)
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
builder.getContext(),
cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
@@ -136,6 +154,51 @@ class OpenACCClauseCIREmitter final
if (!clause.getArchitectures().empty())
operation.setDeviceType(
decodeDeviceType(clause.getArchitectures()[0].getIdentifierInfo()));
+ } else if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+ // Nothing to do here, these constructs don't have any IR for these, as
+ // they just modify the other clauses IR. So setting of `lastDeviceType`
+ // (done above) is all we need.
+ } else {
+ return clauseNotImplemented(clause);
+ }
+ }
+
+ void VisitNumWorkersClause(const OpenACCNumWorkersClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, ParallelOp, KernelsOp>) {
+ // Collect the 'existing' device-type attributes so we can re-create them
+ // and insert them.
+ llvm::SmallVector<mlir::Attribute> deviceTypes;
+ mlir::ArrayAttr existingDeviceTypes =
+ operation.getNumWorkersDeviceTypeAttr();
+
+ if (existingDeviceTypes) {
+ for (mlir::Attribute Attr : existingDeviceTypes)
+ deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+ builder.getContext(),
+ cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
+ }
+
+ // Insert 1 version of the 'int-expr' to the NumWorkers list per-current
+ // device type.
+ mlir::Value intExpr = createIntExpr(clause.getIntExpr());
+ if (lastDeviceTypeClause) {
+ for (const DeviceTypeArgument &arg :
+ lastDeviceTypeClause->getArchitectures()) {
+ deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+ builder.getContext(), decodeDeviceType(arg.getIdentifierInfo())));
+ operation.getNumWorkersMutable().append(intExpr);
+ }
+ } else {
+ // Else, we just add a single for 'none'.
+ deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+ builder.getContext(), mlir::acc::DeviceType::None));
+ operation.getNumWorkersMutable().append(intExpr);
+ }
+
+ operation.setNumWorkersDeviceTypeAttr(
+ mlir::ArrayAttr::get(builder.getContext(), deviceTypes));
+ } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
+ llvm_unreachable("num_workers not valid on serial");
} else {
return clauseNotImplemented(clause);
}
diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c
index ca5bfebcb4ff3..6459b310546cd 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -106,5 +106,57 @@ void acc_kernels(int cond) {
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc
+#pragma acc kernels num_workers(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2u)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
+ // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(3)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>], %[[THREE_CAST]] : si32 [#acc.device_type<multicore>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
+ {}
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 3fb0b987409db..bdb506ee7e1d2 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -105,5 +105,57 @@ void acc_parallel(int cond) {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
+#pragma acc parallel num_workers(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2u)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
+ // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(4)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
+ // CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
+ {}
+ // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+ // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
// CHECK-NEXT: cir.return
}
More information about the cfe-commits
mailing list