[clang] 0ae9dac - [OpenACC][CIR] Lower 'num_workers' for parallel/kernels (#136578)

Mon Apr 21 11:51:30 PDT 2025

Author: Erich Keane
Date: 2025-04-21T11:51:26-07:00
New Revision: 0ae9dac262afccea1e1a2e02520f737ab38b286c

URL: https://github.com/llvm/llvm-project/commit/0ae9dac262afccea1e1a2e02520f737ab38b286c
DIFF: https://github.com/llvm/llvm-project/commit/0ae9dac262afccea1e1a2e02520f737ab38b286c.diff

LOG: [OpenACC][CIR] Lower 'num_workers' for parallel/kernels (#136578)

This patch also includes the first one to handle 'device_type' properly,
which is where most of the 'challenge' here comes from.

>From the best I can tell: we must keep two lists of the same size, 1 of
all of the 'num_workers' items, and 1 of the 'device_type' value for
that 'num_workers'. Additionally, the 'device_type' list can only handle
single 'device_type' values, so we have to duplicate the 'num_workers'
items in cases where there are multiple applicable 'device_type' values.

This patch accomplishes this by keeping the two in sync, and saving the
current 'device_type' in the visitor.

Added: 
    

Modified: 
    clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
    clang/test/CIR/CodeGenOpenACC/kernels.c
    clang/test/CIR/CodeGenOpenACC/parallel.c

Removed: 
    


################################################################################
diff  --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
index 30e840cbfa1f7..604fdf369860e 100644

--- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp
@@ -46,10 +46,27 @@ class OpenACCClauseCIREmitter final
   // diagnostics are gone.
   SourceLocation dirLoc;
 
+  const OpenACCDeviceTypeClause *lastDeviceTypeClause = nullptr;
+
   void clauseNotImplemented(const OpenACCClause &c) {
     cgf.cgm.errorNYI(c.getSourceRange(), "OpenACC Clause", c.getClauseKind());
   }
 
+  mlir::Value createIntExpr(const Expr *intExpr) {
+    mlir::Value expr = cgf.emitScalarExpr(intExpr);
+    mlir::Location exprLoc = cgf.cgm.getLoc(intExpr->getBeginLoc());
+
+    mlir::IntegerType targetType = mlir::IntegerType::get(
+        &cgf.getMLIRContext(), cgf.getContext().getIntWidth(intExpr->getType()),
+        intExpr->getType()->isSignedIntegerOrEnumerationType()
+            ? mlir::IntegerType::SignednessSemantics::Signed
+            : mlir::IntegerType::SignednessSemantics::Unsigned);
+
+    auto conversionOp = builder.create<mlir::UnrealizedConversionCastOp>(
+        exprLoc, targetType, expr);
+    return conversionOp.getResult(0);
+  }
+
   // 'condition' as an OpenACC grammar production is used for 'if' and (some
   // variants of) 'self'.  It needs to be emitted as a signless-1-bit value, so
   // this function emits the expression, then sets the unrealized conversion
@@ -109,6 +126,7 @@ class OpenACCClauseCIREmitter final
   }
 
   void VisitDeviceTypeClause(const OpenACCDeviceTypeClause &clause) {
+    lastDeviceTypeClause = &clause;
     if constexpr (isOneOfTypes<OpTy, InitOp, ShutdownOp>) {
       llvm::SmallVector<mlir::Attribute> deviceTypes;
       std::optional<mlir::ArrayAttr> existingDeviceTypes =
@@ -116,7 +134,7 @@ class OpenACCClauseCIREmitter final
 
       // Ensure we keep the existing ones, and in the correct 'new' order.
       if (existingDeviceTypes) {
-        for (const mlir::Attribute &Attr : *existingDeviceTypes)
+        for (mlir::Attribute Attr : *existingDeviceTypes)
           deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
               builder.getContext(),
               cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
@@ -136,6 +154,51 @@ class OpenACCClauseCIREmitter final
       if (!clause.getArchitectures().empty())
         operation.setDeviceType(
             decodeDeviceType(clause.getArchitectures()[0].getIdentifierInfo()));
+    } else if constexpr (isOneOfTypes<OpTy, ParallelOp, SerialOp, KernelsOp>) {
+      // Nothing to do here, these constructs don't have any IR for these, as
+      // they just modify the other clauses IR.  So setting of `lastDeviceType`
+      // (done above) is all we need.
+    } else {
+      return clauseNotImplemented(clause);
+    }
+  }
+
+  void VisitNumWorkersClause(const OpenACCNumWorkersClause &clause) {
+    if constexpr (isOneOfTypes<OpTy, ParallelOp, KernelsOp>) {
+      // Collect the 'existing' device-type attributes so we can re-create them
+      // and insert them.
+      llvm::SmallVector<mlir::Attribute> deviceTypes;
+      mlir::ArrayAttr existingDeviceTypes =
+          operation.getNumWorkersDeviceTypeAttr();
+
+      if (existingDeviceTypes) {
+        for (mlir::Attribute Attr : existingDeviceTypes)
+          deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+              builder.getContext(),
+              cast<mlir::acc::DeviceTypeAttr>(Attr).getValue()));
+      }
+
+      // Insert 1 version of the 'int-expr' to the NumWorkers list per-current
+      // device type.
+      mlir::Value intExpr = createIntExpr(clause.getIntExpr());
+      if (lastDeviceTypeClause) {
+        for (const DeviceTypeArgument &arg :
+             lastDeviceTypeClause->getArchitectures()) {
+          deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+              builder.getContext(), decodeDeviceType(arg.getIdentifierInfo())));
+          operation.getNumWorkersMutable().append(intExpr);
+        }
+      } else {
+        // Else, we just add a single for 'none'.
+        deviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
+            builder.getContext(), mlir::acc::DeviceType::None));
+        operation.getNumWorkersMutable().append(intExpr);
+      }
+
+      operation.setNumWorkersDeviceTypeAttr(
+          mlir::ArrayAttr::get(builder.getContext(), deviceTypes));
+    } else if constexpr (isOneOfTypes<OpTy, SerialOp>) {
+      llvm_unreachable("num_workers not valid on serial");
     } else {
       return clauseNotImplemented(clause);
     }

diff  --git a/clang/test/CIR/CodeGenOpenACC/kernels.c b/clang/test/CIR/CodeGenOpenACC/kernels.c
index ca5bfebcb4ff3..6459b310546cd 100644
--- a/clang/test/CIR/CodeGenOpenACC/kernels.c
+++ b/clang/test/CIR/CodeGenOpenACC/kernels.c
@@ -106,5 +106,57 @@ void acc_kernels(int cond) {
   // CHECK-NEXT: acc.terminator
   // CHECK-NEXT: } loc
 
+#pragma acc kernels num_workers(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2u)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>], %[[THREE_CAST]] : si32 [#acc.device_type<multicore>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
+#pragma acc kernels device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.kernels num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.terminator
+  // CHECK-NEXT: } loc
+
   // CHECK-NEXT: cir.return
 }

diff  --git a/clang/test/CIR/CodeGenOpenACC/parallel.c b/clang/test/CIR/CodeGenOpenACC/parallel.c
index 3fb0b987409db..bdb506ee7e1d2 100644
--- a/clang/test/CIR/CodeGenOpenACC/parallel.c
+++ b/clang/test/CIR/CodeGenOpenACC/parallel.c
@@ -105,5 +105,57 @@ void acc_parallel(int cond) {
   // CHECK-NEXT: acc.yield
   // CHECK-NEXT: } loc
 
+#pragma acc parallel num_workers(cond)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2u)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !u32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !u32i to ui32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : ui32 [#acc.device_type<nvidia>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia, host) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[TWO_CAST]] : si32 [#acc.device_type<host>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel num_workers(cond) device_type(nvidia) num_workers(2) device_type(radeon, multicore) num_workers(4)
+  {}
+  // CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
+  // CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[FOUR_LITERAL:.*]] = cir.const #cir.int<4> : !s32i
+  // CHECK-NEXT: %[[FOUR_CAST:.*]] = builtin.unrealized_conversion_cast %[[FOUR_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[CONV_CAST]] : si32, %[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[FOUR_CAST]] : si32 [#acc.device_type<radeon>], %[[FOUR_CAST]] : si32 [#acc.device_type<multicore>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
+#pragma acc parallel device_type(nvidia) num_workers(2) device_type(radeon) num_workers(3)
+  {}
+  // CHECK-NEXT: %[[TWO_LITERAL:.*]] = cir.const #cir.int<2> : !s32i
+  // CHECK-NEXT: %[[TWO_CAST:.*]] = builtin.unrealized_conversion_cast %[[TWO_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
+  // CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
+  // CHECK-NEXT: acc.parallel num_workers(%[[TWO_CAST]] : si32 [#acc.device_type<nvidia>], %[[THREE_CAST]] : si32 [#acc.device_type<radeon>]) {
+  // CHECK-NEXT: acc.yield
+  // CHECK-NEXT: } loc
+
   // CHECK-NEXT: cir.return
 }