[clang] [OpenACC][CIR] Implement 'async' lowering. (PR #136626)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Apr 21 15:24:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang
Author: Erich Keane (erichkeane)
<details>
<summary>Changes</summary>
Async acts just like num_workers/vector_length in that it gets a new variant per device_type and is lowered as an operand.
However, it has one additional complication, in that it can have a variant that has no argument, which produces an attribute with the correct devicetype.
Additionally, this syncronizes us with the implementation of flang,
which prohibits multiple 'async' clauses per-device_type.
---
Full diff: https://github.com/llvm/llvm-project/pull/136626.diff
6 Files Affected:
- (modified) clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp (+33-9)
- (modified) clang/lib/Sema/SemaOpenACCClause.cpp (+3)
- (modified) clang/test/CIR/CodeGenOpenACC/kernels.c (+46)
- (modified) clang/test/CIR/CodeGenOpenACC/parallel.c (+46)
- (modified) clang/test/CIR/CodeGenOpenACC/serial.c (+46)
- (modified) clang/test/SemaOpenACC/compute-construct-async-clause.c (+42)
``````````diff
diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index e7dd2e74b0864..82fbb49db3bc8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -97,11 +97,13 @@ class OpenACCClauseCIREmitter final
// Handle a clause affected by the 'device-type' to the point that they need
// to have the attributes added in the correct/corresponding order, such as
- // 'num_workers' or 'vector_length' on a compute construct.
- mlir::ArrayAttr
- handleDeviceTypeAffectedClause(mlir::ArrayAttr existingDeviceTypes,
- mlir::Value argument,
- mlir::MutableOperandRange &argCollection) {
+ // 'num_workers' or 'vector_length' on a compute construct. For cases where we
+ // don't have an argument that needs to be added to an additional one (such as
+ // asyncOnly) we can use this with 'argument' as std::nullopt.
+ mlir::ArrayAttr handleDeviceTypeAffectedClause(
+ mlir::ArrayAttr existingDeviceTypes,
+ std::optional<mlir::Value> argument = std::nullopt,
+ mlir::MutableOperandRange *argCollection = nullptr) {
llvm::SmallVector<mlir::Attribute> deviceTypes;
// Collect the 'existing' device-type attributes so we can re-create them
@@ -120,13 +122,19 @@ class OpenACCClauseCIREmitter final
lastDeviceTypeClause->getArchitectures()) {
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
builder.getContext(), decodeDeviceType(arch.getIdentifierInfo())));
- argCollection.append(argument);
+ if (argument) {
+ assert(argCollection);
+ argCollection->append(*argument);
+ }
}
} else {
// Else, we just add a single for 'none'.
deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
builder.getContext(), mlir::acc::DeviceType::None));
- argCollection.append(argument);
+ if (argument) {
+ assert(argCollection);
+ argCollection->append(*argument);
+ }
}
return mlir::ArrayAttr::get(builder.getContext(), deviceTypes);
@@ -205,7 +213,7 @@ class OpenACCClauseCIREmitter final
mlir::MutableOperandRange range = operation.getNumWorkersMutable();
operation.setNumWorkersDeviceTypeAttr(handleDeviceTypeAffectedClause(
operation.getNumWorkersDeviceTypeAttr(),
- createIntExpr(clause.getIntExpr()), range));
+ createIntExpr(clause.getIntExpr()), &range));
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
llvm_unreachable("num_workers not valid on serial");
} else {
@@ -218,7 +226,7 @@ class OpenACCClauseCIREmitter final
mlir::MutableOperandRange range = operation.getVectorLengthMutable();
operation.setVectorLengthDeviceTypeAttr(handleDeviceTypeAffectedClause(
operation.getVectorLengthDeviceTypeAttr(),
- createIntExpr(clause.getIntExpr()), range));
+ createIntExpr(clause.getIntExpr()), &range));
} else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
llvm_unreachable("vector_length not valid on serial");
} else {
@@ -226,6 +234,22 @@ class OpenACCClauseCIREmitter final
}
}
+ void VisitAsyncClause(const OpenACCAsyncClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+ if (!clause.hasIntExpr()) {
+ operation.setAsyncOnlyAttr(
+ handleDeviceTypeAffectedClause(operation.getAsyncOnlyAttr()));
+ } else {
+ mlir::MutableOperandRange range = operation.getAsyncOperandsMutable();
+ operation.setAsyncOperandsDeviceTypeAttr(handleDeviceTypeAffectedClause(
+ operation.getAsyncOperandsDeviceTypeAttr(),
+ createIntExpr(clause.getIntExpr()), &range));
+ }
+ } else {
+ return clauseNotImplemented(clause);
+ }
+ }
+
void VisitSelfClause(const OpenACCSelfClause &clause) {
if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
if (clause.isEmptySelfClause()) {
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index 3694a831b76de..ed437ac62e332 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -639,6 +639,9 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause(
OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
SemaOpenACC::OpenACCParsedClause &Clause) {
+ if (DisallowSinceLastDeviceType<OpenACCAsyncClause>(Clause))
+ return nullptr;
+
assert(Clause.getNumIntExprs() < 2 &&
"Invalid number of expressions for Async");
return OpenACCAsyncClause::Create(
diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c
index d2da1d18f1534..1744acf0ab223 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -210,5 +210,51 @@ void acc_kernels(int cond) {
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc
+#pragma acc kernels async
+ {}
+ // CHECK-NEXT: acc.kernels {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: acc.kernels {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) {
+ // CHECK-NEXT: acc.terminator
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 61dccc591c252..892d931c880e7 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -209,5 +209,51 @@ void acc_parallel(int cond) {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
+#pragma acc parallel async
+ {}
+ // CHECK-NEXT: acc.parallel {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: acc.parallel {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c
index b72f44a2ea473..094958f0e3b23 100644
--- a/clang/test/CIR/CodeGenOpenACC/serial.c
+++ b/clang/test/CIR/CodeGenOpenACC/serial.c
@@ -106,5 +106,51 @@ void acc_serial(int cond) {
// CHECK-NEXT: acc.yield
// CHECK-NEXT: } loc
+#pragma acc serial async
+ {}
+ // CHECK-NEXT: acc.serial {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: acc.serial {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async(cond)
+ {}
+ // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async
+ {}
+ // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+ // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+ // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) {
+ // CHECK-NEXT: acc.yield
+ // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
// CHECK-NEXT: cir.return
}
diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.c b/clang/test/SemaOpenACC/compute-construct-async-clause.c
index 4895d7f2209bb..4ca963713254c 100644
--- a/clang/test/SemaOpenACC/compute-construct-async-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-async-clause.c
@@ -20,6 +20,48 @@ void Test() {
#pragma acc serial async(1, 2)
while(1);
+ // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc kernels async async
+ while(1);
+
+ // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc kernels async(1) async(2)
+ while(1);
+
+ // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'parallel' directive}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc parallel async(1) async(2)
+ while(1);
+
+ // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'serial' directive}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc serial async(1) async(2)
+ while(1);
+
+ // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'kernels' directive}}
+ // expected-note at +2{{previous clause is here}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc kernels async(1) device_type(*) async(1) async(2)
+ while(1);
+ // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
+ // expected-note at +2{{previous clause is here}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc parallel async device_type(*) async async
+ while(1);
+ // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'serial' directive}}
+ // expected-note at +2{{previous clause is here}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc serial async(1) device_type(*) async async(2)
+ while(1);
+
+ // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
+ // expected-note at +2{{previous clause is here}}
+ // expected-note at +1{{previous clause is here}}
+#pragma acc parallel device_type(*) async async
+ while(1);
+
struct NotConvertible{} NC;
// expected-error at +1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
#pragma acc parallel async(NC)
``````````
</details>
https://github.com/llvm/llvm-project/pull/136626
More information about the cfe-commits
mailing list