[clang] [OpenACC][CIR] Implement 'async' lowering. (PR #136626)

Erich Keane via cfe-commits cfe-commits at lists.llvm.org
Tue Apr 22 06:10:54 PDT 2025


https://github.com/erichkeane updated https://github.com/llvm/llvm-project/pull/136626

>From 554256a719265abeb3cac278fbb1a19d7b989545 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane at nvidia.com>
Date: Mon, 21 Apr 2025 12:54:23 -0700
Subject: [PATCH 1/2] [OpenACC][CIR] Implement 'async' lowering.

Async acts just like num_workers/vector_length in that it gets a new
variant per device_type and is lowered as an operand.

However, it has one additional complication, in that it can have a
variant that has no argument, which produces an attribute with the
correct devicetype.

Additionally, this syncronizes us with the implementation of flang,
  which prohibits multiple 'async' clauses per-device_type.
---
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp   | 42 +++++++++++++----
 clang/lib/Sema/SemaOpenACCClause.cpp          |  3 ++
 clang/test/CIR/CodeGenOpenACC/kernels.c       | 46 +++++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/parallel.c      | 46 +++++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/serial.c        | 46 +++++++++++++++++++
 .../compute-construct-async-clause.c          | 42 +++++++++++++++++
 6 files changed, 216 insertions(+), 9 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index e7dd2e74b0864..82fbb49db3bc8 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -97,11 +97,13 @@ class OpenACCClauseCIREmitter final
 
   // Handle a clause affected by the 'device-type' to the point that they need
   // to have the attributes added in the correct/corresponding order, such as
-  // 'num_workers' or 'vector_length' on a compute construct.
-  mlir::ArrayAttr
-  handleDeviceTypeAffectedClause(mlir::ArrayAttr existingDeviceTypes,
-                                 mlir::Value argument,
-                                 mlir::MutableOperandRange &argCollection) {
+  // 'num_workers' or 'vector_length' on a compute construct. For cases where we
+  // don't have an argument that needs to be added to an additional one (such as
+  // asyncOnly) we can use this with 'argument' as std::nullopt.
+  mlir::ArrayAttr handleDeviceTypeAffectedClause(
+      mlir::ArrayAttr existingDeviceTypes,
+      std::optional<mlir::Value> argument = std::nullopt,
+      mlir::MutableOperandRange *argCollection = nullptr) {
     llvm::SmallVector<mlir::Attribute> deviceTypes;
 
     // Collect the 'existing' device-type attributes so we can re-create them
@@ -120,13 +122,19 @@ class OpenACCClauseCIREmitter final
            lastDeviceTypeClause->getArchitectures()) {
         deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
             builder.getContext(), decodeDeviceType(arch.getIdentifierInfo())));
-        argCollection.append(argument);
+        if (argument) {
+          assert(argCollection);
+          argCollection->append(*argument);
+        }
       }
     } else {
       // Else, we just add a single for 'none'.
       deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
           builder.getContext(), mlir::acc::DeviceType::None));
-      argCollection.append(argument);
+      if (argument) {
+        assert(argCollection);
+        argCollection->append(*argument);
+      }
     }
 
     return mlir::ArrayAttr::get(builder.getContext(), deviceTypes);
@@ -205,7 +213,7 @@ class OpenACCClauseCIREmitter final
       mlir::MutableOperandRange range = operation.getNumWorkersMutable();
       operation.setNumWorkersDeviceTypeAttr(handleDeviceTypeAffectedClause(
           operation.getNumWorkersDeviceTypeAttr(),
-          createIntExpr(clause.getIntExpr()), range));
+          createIntExpr(clause.getIntExpr()), &range));
     } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
       llvm_unreachable("num_workers not valid on serial");
     } else {
@@ -218,7 +226,7 @@ class OpenACCClauseCIREmitter final
       mlir::MutableOperandRange range = operation.getVectorLengthMutable();
       operation.setVectorLengthDeviceTypeAttr(handleDeviceTypeAffectedClause(
           operation.getVectorLengthDeviceTypeAttr(),
-          createIntExpr(clause.getIntExpr()), range));
+          createIntExpr(clause.getIntExpr()), &range));
     } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
       llvm_unreachable("vector_length not valid on serial");
     } else {
@@ -226,6 +234,22 @@ class OpenACCClauseCIREmitter final
     }
   }
 
+  void VisitAsyncClause(const OpenACCAsyncClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+      if (!clause.hasIntExpr()) {
+        operation.setAsyncOnlyAttr(
+            handleDeviceTypeAffectedClause(operation.getAsyncOnlyAttr()));
+      } else {
+        mlir::MutableOperandRange range = operation.getAsyncOperandsMutable();
+        operation.setAsyncOperandsDeviceTypeAttr(handleDeviceTypeAffectedClause(
+            operation.getAsyncOperandsDeviceTypeAttr(),
+            createIntExpr(clause.getIntExpr()), &range));
+      }
+    } else {
+      return clauseNotImplemented(clause);
+    }
+  }
+
   void VisitSelfClause(const OpenACCSelfClause &clause) {
     if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
       if (clause.isEmptySelfClause()) {
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index 3694a831b76de..ed437ac62e332 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -639,6 +639,9 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
+  if (DisallowSinceLastDeviceType<OpenACCAsyncClause>(Clause))
+    return nullptr;
+
   assert(Clause.getNumIntExprs() < 2 &&
          "Invalid number of expressions for Async");
   return OpenACCAsyncClause::Create(
diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c
index d2da1d18f1534..1744acf0ab223 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -210,5 +210,51 @@ void acc_kernels(int cond) {
   // CHECK-NEXT: acc.terminator
   // CHECK-NEXT: } loc
 
+#pragma acc kernels async
+  {}
+  // CHECK-NEXT: acc.kernels {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: acc.kernels {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels async device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc kernels async(3) device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels async(%[[THREE_CAST]] : si32) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 61dccc591c252..892d931c880e7 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -209,5 +209,51 @@ void acc_parallel(int cond) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
 
+#pragma acc parallel async
+  {}
+  // CHECK-NEXT: acc.parallel {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: acc.parallel {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel async device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc parallel async(3) device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel async(%[[THREE_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/serial.c b/clang/test/CIR/CodeGenOpenACC/serial.c
index b72f44a2ea473..094958f0e3b23 100644
--- a/clang/test/CIR/CodeGenOpenACC/serial.c
+++ b/clang/test/CIR/CodeGenOpenACC/serial.c
@@ -106,5 +106,51 @@ void acc_serial(int cond) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
 
+#pragma acc serial async
+  {}
+  // CHECK-NEXT: acc.serial {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: acc.serial {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc serial async device_type(nvidia, radeon) async(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<none>]}
+
+#pragma acc serial async(3) device_type(nvidia, radeon) async
+  {}
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.serial async(%[[THREE_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } attributes {asyncOnly = [#acc.device_type<nvidia>, #acc.device_type<radeon>]}
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/SemaOpenACC/compute-construct-async-clause.c b/clang/test/SemaOpenACC/compute-construct-async-clause.c
index 4895d7f2209bb..4ca963713254c 100644
--- a/clang/test/SemaOpenACC/compute-construct-async-clause.c
+++ b/clang/test/SemaOpenACC/compute-construct-async-clause.c
@@ -20,6 +20,48 @@ void Test() {
 #pragma acc serial async(1, 2)
   while(1);
 
+  // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc kernels async async
+  while(1);
+
+  // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'kernels' directive}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc kernels async(1) async(2)
+  while(1);
+
+  // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'parallel' directive}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc parallel async(1) async(2)
+  while(1);
+
+  // expected-error at +2{{OpenACC 'async' clause cannot appear more than once on a 'serial' directive}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc serial async(1) async(2)
+  while(1);
+
+  // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'kernels' directive}}
+  // expected-note at +2{{previous clause is here}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc kernels async(1) device_type(*) async(1) async(2)
+  while(1);
+  // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
+  // expected-note at +2{{previous clause is here}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc parallel async device_type(*) async async
+  while(1);
+  // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'serial' directive}}
+  // expected-note at +2{{previous clause is here}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc serial async(1) device_type(*) async async(2)
+  while(1);
+
+  // expected-error at +3{{OpenACC 'async' clause cannot appear more than once in a 'device_type' region on a 'parallel' directive}}
+  // expected-note at +2{{previous clause is here}}
+  // expected-note at +1{{previous clause is here}}
+#pragma acc parallel device_type(*) async async
+  while(1);
+
   struct NotConvertible{} NC;
   // expected-error at +1{{OpenACC clause 'async' requires expression of integer type ('struct NotConvertible' invalid)}}
 #pragma acc parallel async(NC)

>From 34585b6ace3ab0b93d14ec3444fd53bbf0c8f5fa Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane at nvidia.com>
Date: Tue, 22 Apr 2025 06:10:42 -0700
Subject: [PATCH 2/2] Update comments as suggested by andy and morris

---
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 8 ++++++--
 clang/lib/Sema/SemaOpenACCClause.cpp        | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index 82fbb49db3bc8..016481f136d05 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -98,8 +98,12 @@ class OpenACCClauseCIREmitter final
   // Handle a clause affected by the 'device-type' to the point that they need
   // to have the attributes added in the correct/corresponding order, such as
   // 'num_workers' or 'vector_length' on a compute construct. For cases where we
-  // don't have an argument that needs to be added to an additional one (such as
-  // asyncOnly) we can use this with 'argument' as std::nullopt.
+  // don't have an expression 'argument' that needs to be added to an operand
+  // and only care about the 'device-type' list, we can use this with 'argument'
+  // as 'std::nullopt'.   If 'argument' is NOT 'std::nullopt' (that is, has a
+  // value), argCollection must also be non-null. For cases where we don't have
+  // an argument that needs to be added to an additional one (such as asyncOnly)
+  // we can use this with 'argument' as std::nullopt.
   mlir::ArrayAttr handleDeviceTypeAffectedClause(
       mlir::ArrayAttr existingDeviceTypes,
       std::optional<mlir::Value> argument = std::nullopt,
diff --git a/clang/lib/Sema/SemaOpenACCClause.cpp b/clang/lib/Sema/SemaOpenACCClause.cpp
index ed437ac62e332..fba40c834e703 100644
--- a/clang/lib/Sema/SemaOpenACCClause.cpp
+++ b/clang/lib/Sema/SemaOpenACCClause.cpp
@@ -334,7 +334,7 @@ class SemaOpenACCClauseVisitor {
   }
 
   // For 'tile' and 'collapse', only allow 1 per 'device_type'.
-  // Also applies to num_worker, num_gangs, and vector_length.
+  // Also applies to num_worker, num_gangs, vector_length, and async.
   template <typename TheClauseTy>
   bool DisallowSinceLastDeviceType(SemaOpenACC::OpenACCParsedClause &Clause) {
     auto LastDeviceTypeItr =



More information about the cfe-commits mailing list