[clang] [OpenACC][CIR] Lower 'num_workers' for parallel/kernels (PR #136578)

Erich Keane via cfe-commits cfe-commits at lists.llvm.org
Mon Apr 21 10:47:55 PDT 2025


https://github.com/erichkeane updated https://github.com/llvm/llvm-project/pull/136578

>From e12a5016b203446d68019b4f18c19573b481aef7 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane at nvidia.com>
Date: Wed, 16 Apr 2025 08:33:35 -0700
Subject: [PATCH 1/2] [OpenACC][CIR] Lower 'num_workers' for parallel/kernels

This patch also includes the first one to handle 'device_type' properly,
which is where most of the 'challenge' here comes from.

>From the best I can tell: we must keep two lists of the same size, 1 of
all of the 'num_workers' items, and 1 of the 'device_type' value for
that 'num_workers'.  Additionally, the 'device_type' list can only
handle single 'device_type' values, so we have to duplicate the
'num_workers' items in cases where there are multiple applicable
'device_type' values.

This patch accomplishes this by keeping the two in sync, and saving the
current 'device_type' in the visitor.
---
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 63 +++++++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/kernels.c     | 52 +++++++++++++++++
 clang/test/CIR/CodeGenOpenACC/parallel.c    | 52 +++++++++++++++++
 3 files changed, 167 insertions(+)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index 30e840cbfa1f7..c7b5c3348e77d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -46,10 +46,27 @@ class OpenACCClauseCIREmitter final
   // diagnostics are gone.
   SourceLocation dirLoc;
 
+  const OpenACCDeviceTypeClause *lastDeviceTypeClause = nullptr;
+
   void clauseNotImplemented(const OpenACCClause &c) {
     cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind());
   }
 
+  mlir::Value createIntExpr(const Expr *intExpr) {
+    mlir::Value expr = cgf.emitScalarExpr(intExpr);
+    mlir::Location exprLoc = cgf.cgm.getLoc(intExpr->getBeginLoc());
+
+    mlir::IntegerType targetType = mlir::IntegerType::get(
+        &cgf.getMLIRContext(), cgf.getContext().getIntWidth(intExpr->getType()),
+        intExpr->getType()->isSignedIntegerOrEnumerationType()
+            ? mlir::IntegerType::SignednessSemantics::Signed
+            : mlir::IntegerType::SignednessSemantics::Unsigned);
+
+    auto conversionOp = builder.create<mlir::UnrealizedConversionCastOp>(
+        exprLoc, targetType, expr);
+    return conversionOp.getResult(0);
+  }
+
   // 'condition' as an OpenACC grammar production is used for 'if' and (some
   // variants of) 'self'.  It needs to be emitted as a signless-1-bit value, so
   // this function emits the expression, then sets the unrealized conversion
@@ -109,6 +126,7 @@ class OpenACCClauseCIREmitter final
   }
 
   void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) {
+    lastDeviceTypeClause = &clause;
     if constexpr (isOneOfTypes<OpTy, InitOp, ShutdownOp>) {
       llvm::SmallVector<mlir::Attribute> deviceTypes;
       std::optional<mlir::ArrayAttr> existingDeviceTypes =
@@ -136,6 +154,51 @@ class OpenACCClauseCIREmitter final
       if (!clause.getArchitectures().empty())
         operation.setDeviceType(
             decodeDeviceType(clause.getArchitectures()[0].getIdentifierInfo()));
+    } else if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+      // Nothing to do here, these constructs don't have any IR for these, as
+      // they just modify the other clauses IR.  So setting of `lastDeviceType`
+      // (done above) is all we need.
+    } else {
+      return clauseNotImplemented(clause);
+    }
+  }
+
+  void VisitNumWorkersClause(const OpenACCNumWorkersClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, ParallelOp, KernelsOp>) {
+      // Collect the 'existing' device-type attributes so we can re-create them
+      // and insert them.
+      llvm::SmallVector<mlir::Attribute> deviceTypes;
+      mlir::ArrayAttr existingDeviceTypes =
+          operation.getNumWorkersDeviceTypeAttr();
+
+      if (existingDeviceTypes) {
+        for (const mlir::Attribute &Attr : existingDeviceTypes)
+          deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+              builder.getContext(),
+              cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
+      }
+
+      // Insert 1 version of the 'int-expr' to the NumWorkers list per-current
+      // device type.
+      mlir::Value intExpr = createIntExpr(clause.getIntExpr());
+      if (lastDeviceTypeClause) {
+        for (const DeviceTypeArgument &arg :
+             lastDeviceTypeClause->getArchitectures()) {
+          deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+              builder.getContext(), decodeDeviceType(arg.getIdentifierInfo())));
+          operation.getNumWorkersMutable().append(intExpr);
+        }
+      } else {
+        // Else, we just add a single for 'none'.
+        deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+            builder.getContext(), mlir::acc::DeviceType::None));
+        operation.getNumWorkersMutable().append(intExpr);
+      }
+
+      operation.setNumWorkersDeviceTypeAttr(
+          mlir::ArrayAttr::get(builder.getContext(), deviceTypes));
+    } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
+      llvm_unreachable("num_workers not valid on serial");
     } else {
       return clauseNotImplemented(clause);
     }
diff --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c
index ca5bfebcb4ff3..6459b310546cd 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -106,5 +106,57 @@ void acc_kernels(int cond) {
   // CHECK-NEXT: acc.terminator
   // CHECK-NEXT: } loc
 
+#pragma acc kernels num_workers(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2u)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>], %[[THREE_CAST]] : si32 [#acc.device_type<multicore>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
   // CHECK-NEXT: cir.return
 }
diff --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 3fb0b987409db..bdb506ee7e1d2 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -105,5 +105,57 @@ void acc_parallel(int cond) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
 
+#pragma acc parallel num_workers(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2u)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(4)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
+  // CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
   // CHECK-NEXT: cir.return
 }

>From 40766ab6521fcc8bd2a082b26960274cbe58cff8 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane at nvidia.com>
Date: Mon, 21 Apr 2025 10:46:45 -0700
Subject: [PATCH 2/2] Correct device-type for loops

---
 clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index c7b5c3348e77d..604fdf369860e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -134,7 +134,7 @@ class OpenACCClauseCIREmitter final
 
       // Ensure we keep the existing ones, and in the correct 'new' order.
       if (existingDeviceTypes) {
-        for (const mlir::Attribute &Attr : *existingDeviceTypes)
+        for (mlir::Attribute Attr : *existingDeviceTypes)
           deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
               builder.getContext(),
               cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
@@ -172,7 +172,7 @@ class OpenACCClauseCIREmitter final
           operation.getNumWorkersDeviceTypeAttr();
 
       if (existingDeviceTypes) {
-        for (const mlir::Attribute &Attr : existingDeviceTypes)
+        for (mlir::Attribute Attr : existingDeviceTypes)
           deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
               builder.getContext(),
               cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));



More information about the cfe-commits mailing list