[Mlir-commits] [mlir] f4e7ba0 - [OpenACC][CIR] Implement 'worker'/'vector' lowering (#138765)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Wed May 7 13:48:20 PDT 2025
Author: Erich Keane
Date: 2025-05-07T13:48:17-07:00
New Revision: f4e7ba02cc7fd35f3e5ad82cf98c3220af7cd068
URL: https://github.com/llvm/llvm-project/commit/f4e7ba02cc7fd35f3e5ad82cf98c3220af7cd068
DIFF: https://github.com/llvm/llvm-project/commit/f4e7ba02cc7fd35f3e5ad82cf98c3220af7cd068.diff
LOG: [OpenACC][CIR] Implement 'worker'/'vector' lowering (#138765)
This patch implements worker and vector lowering for the loop construct,
which are fairly simple clauses, except that they also have a 'no
argument' form which requires a touch more work. Else, these are just
like a handful of other clauses where we just keep the device_type array
and operands in sync.
Added:
Modified:
clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
clang/test/CIR/CodeGenOpenACC/loop.cpp
mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
Removed:
################################################################################
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
index fa4ce5efc39ad..ef4f64a167742 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
@@ -392,6 +392,38 @@ class OpenACCClauseCIREmitter final
return clauseNotImplemented(clause);
}
}
+
+ void VisitWorkerClause(const OpenACCWorkerClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+ if (clause.hasIntExpr())
+ operation.addWorkerNumOperand(builder.getContext(),
+ createIntExpr(clause.getIntExpr()),
+ lastDeviceTypeValues);
+ else
+ operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues);
+
+ } else {
+ // TODO: When we've implemented this for everything, switch this to an
+ // unreachable. Combined constructs remain.
+ return clauseNotImplemented(clause);
+ }
+ }
+
+ void VisitVectorClause(const OpenACCVectorClause &clause) {
+ if constexpr (isOneOfTypes<OpTy, mlir::acc::LoopOp>) {
+ if (clause.hasIntExpr())
+ operation.addVectorOperand(builder.getContext(),
+ createIntExpr(clause.getIntExpr()),
+ lastDeviceTypeValues);
+ else
+ operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues);
+
+ } else {
+ // TODO: When we've implemented this for everything, switch this to an
+ // unreachable. Combined constructs remain.
+ return clauseNotImplemented(clause);
+ }
+ }
};
template <typename OpTy>
diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp
index b255a01adda0e..d636d1b37d969 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp
@@ -193,4 +193,134 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) {
// CHECK: acc.yield
// CHECK-NEXT: } loc
+
+#pragma acc kernels
+ {
+
+#pragma acc loop worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK: acc.loop worker {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker device_type(nvidia, radeon) worker(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) device_type(nvidia, radeon) worker(N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) worker(num:N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+
+#pragma acc loop vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK: acc.loop vector {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector device_type(nvidia, radeon) vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.loop vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop vector(N) device_type(nvidia, radeon) vector(N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop device_type(nvidia, radeon) vector(length:N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker vector device_type(nvidia) worker vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.loop worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+
+#pragma acc loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
+ // CHECK-NEXT: acc.loop worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ }
}
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 41b01a14a6498..ca564037fad19 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2216,6 +2216,21 @@ def OpenACC_LoopOp : OpenACC_Op<"loop",
// values should be integral constants, with the '*' represented as a '-1'.
void setTileForDeviceTypes(MLIRContext *, llvm::ArrayRef<DeviceType>,
mlir::ValueRange);
+
+ // Add a value to the 'vector' list with a current list of device_types.
+ void addVectorOperand(MLIRContext *, mlir::Value,
+ llvm::ArrayRef<DeviceType>);
+ // Add an empty value to the 'vector' list with a current list of
+ // device_types. This is for the case where there is no expression specified
+ // in a 'vector'.
+ void addEmptyVector(MLIRContext *, llvm::ArrayRef<DeviceType>);
+ // Add a value to the 'worker' list with a current list of device_types.
+ void addWorkerNumOperand(MLIRContext *, mlir::Value,
+ llvm::ArrayRef<DeviceType>);
+ // Add an empty value to the 'worker' list with a current list of
+ // device_types. This is for the case where there is no expression specified
+ // in a 'worker'.
+ void addEmptyWorker(MLIRContext *, llvm::ArrayRef<DeviceType>);
}];
let hasCustomAssemblyFormat = 1;
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index f26b3a5143c0b..9f4645a4a7ca8 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -2720,6 +2720,34 @@ void acc::LoopOp::setTileForDeviceTypes(
setTileOperandsSegments(segments);
}
+void acc::LoopOp::addVectorOperand(
+ MLIRContext *context, mlir::Value newValue,
+ llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setVectorOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+ context, getVectorOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+ newValue, getVectorOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyVector(
+ MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setVectorAttr(addDeviceTypeAffectedOperandHelper(context, getVectorAttr(),
+ effectiveDeviceTypes));
+}
+
+void acc::LoopOp::addWorkerNumOperand(
+ MLIRContext *context, mlir::Value newValue,
+ llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setWorkerNumOperandsDeviceTypeAttr(addDeviceTypeAffectedOperandHelper(
+ context, getWorkerNumOperandsDeviceTypeAttr(), effectiveDeviceTypes,
+ newValue, getWorkerNumOperandsMutable()));
+}
+
+void acc::LoopOp::addEmptyWorker(
+ MLIRContext *context, llvm::ArrayRef<DeviceType> effectiveDeviceTypes) {
+ setWorkerAttr(addDeviceTypeAffectedOperandHelper(context, getWorkerAttr(),
+ effectiveDeviceTypes));
+}
+
//===----------------------------------------------------------------------===//
// DataOp
//===----------------------------------------------------------------------===//
More information about the Mlir-commits
mailing list