[clang] 31fd77a - [OpenACC][CIR] worker/vector lowering for combined constructs
via cfe-commits
cfe-commits at lists.llvm.org
Fri May 9 12:54:47 PDT 2025
Author: erichkeane
Date: 2025-05-09T12:54:42-07:00
New Revision: 31fd77aa51a643245f8eb277483554509b771832
URL: https://github.com/llvm/llvm-project/commit/31fd77aa51a643245f8eb277483554509b771832
DIFF: https://github.com/llvm/llvm-project/commit/31fd77aa51a643245f8eb277483554509b771832.diff
LOG: [OpenACC][CIR] worker/vector lowering for combined constructs
Another set of two-line changes, but this one makes sure to add sufficient testing.
Added:
Modified:
clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
clang/test/CIR/CodeGenOpenACC/combined.cpp
Removed:
################################################################################
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
index 97624ea509ade..1661e3d0c4b62 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h
@@ -465,6 +465,8 @@ class OpenACCClauseCIREmitter final
else
operation.addEmptyWorker(builder.getContext(), lastDeviceTypeValues);
+ } else if constexpr (isCombinedType<OpTy>) {
+ applyToLoopOp(clause);
} else {
// TODO: When we've implemented this for everything, switch this to an
// unreachable. Combined constructs remain.
@@ -481,6 +483,8 @@ class OpenACCClauseCIREmitter final
else
operation.addEmptyVector(builder.getContext(), lastDeviceTypeValues);
+ } else if constexpr (isCombinedType<OpTy>) {
+ applyToLoopOp(clause);
} else {
// TODO: When we've implemented this for everything, switch this to an
// unreachable. Combined constructs remain.
diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp
index b9fb82af46cca..d79f964f19ebf 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp
@@ -401,4 +401,176 @@ extern "C" void acc_combined(int N) {
// CHECK-NEXT: } loc
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: acc.loop combined(kernels) worker {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker device_type(nvidia, radeon) worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker device_type(nvidia, radeon) worker(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>], %[[N_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker(N) device_type(nvidia, radeon) worker(N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop device_type(nvidia, radeon) worker(num:N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+
+#pragma acc kernels loop worker vector device_type(nvidia) worker vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: acc.loop combined(kernels) worker([#acc.device_type<none>, #acc.device_type<nvidia>]) vector([#acc.device_type<none>, #acc.device_type<nvidia>])
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK: acc.loop combined(kernels) vector {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop vector(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop vector device_type(nvidia, radeon) vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) vector([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[N_CONV]] : si32) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop vector(N) device_type(nvidia, radeon) vector(N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD2]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_CONV]] : si32, %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop device_type(nvidia, radeon) vector(length:N + 1)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[ONE_CONST:.*]] = cir.const #cir.int<1> : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE:.*]] = cir.binop(add, %[[N_LOAD]], %[[ONE_CONST]]) nsw : !s32i
+ // CHECK-NEXT: %[[N_PLUS_ONE_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_PLUS_ONE]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) vector(%[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<nvidia>], %[[N_PLUS_ONE_CONV]] : si32 [#acc.device_type<radeon>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
+
+#pragma acc kernels loop worker(N) vector(N) device_type(nvidia) worker(N) vector(N)
+ for(unsigned I = 0; I < N; ++I);
+ // CHECK-NEXT: acc.kernels combined(loop) {
+ // CHECK-NEXT: %[[N_LOAD:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD2:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV2:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD2]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD3:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV3:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD3]] : !s32i to si32
+ // CHECK-NEXT: %[[N_LOAD4:.*]] = cir.load %[[ALLOCA_N]] : !cir.ptr<!s32i>, !s32i
+ // CHECK-NEXT: %[[N_CONV4:.*]] = builtin.unrealized_conversion_cast %[[N_LOAD4]] : !s32i to si32
+ // CHECK-NEXT: acc.loop combined(kernels) worker(%[[N_CONV]] : si32, %[[N_CONV3]] : si32 [#acc.device_type<nvidia>]) vector(%[[N_CONV2]] : si32, %[[N_CONV4]] : si32 [#acc.device_type<nvidia>]) {
+ // CHECK: acc.yield
+ // CHECK-NEXT: } loc
+ // CHECK: acc.terminator
+ // CHECK-NEXT: } loc
}
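
For readers skimming the diff: the two added lines in each clause handler forward a worker/vector clause that appears on a combined construct (e.g. `kernels loop`) to the contained acc.loop op via applyToLoopOp. The snippet below is a minimal, self-contained sketch of that `if constexpr` dispatch idiom, not the actual CIRGen code; LoopOp, KernelsLoopOp, ClauseEmitter, emitWorkerClause and the printed messages are stand-ins invented for illustration, while isCombinedType, applyToLoopOp and the TODO branch mirror names visible in the diff above.

#include <iostream>
#include <type_traits>

struct LoopOp {};        // stand-in for the plain acc.loop construct
struct KernelsLoopOp {}; // stand-in for a combined construct such as kernels loop

// Trait mirroring isCombinedType<OpTy> from the diff: true only for
// combined-construct ops.
template <typename OpTy> constexpr bool isCombinedType = false;
template <> constexpr bool isCombinedType<KernelsLoopOp> = true;

template <typename OpTy> struct ClauseEmitter {
  void emitWorkerClause() {
    if constexpr (std::is_same_v<OpTy, LoopOp>) {
      // Direct case: the clause is attached to the loop op itself
      // (addWorker/addEmptyWorker in the real emitter).
      std::cout << "attach worker clause to acc.loop\n";
    } else if constexpr (isCombinedType<OpTy>) {
      // The two added lines: a combined construct forwards the clause to its
      // contained loop op (applyToLoopOp(clause) in the real emitter).
      std::cout << "forward worker clause to the combined construct's loop\n";
    } else {
      // Remaining constructs: still the TODO / unimplemented branch.
      std::cout << "worker clause not yet implemented for this construct\n";
    }
  }
};

int main() {
  ClauseEmitter<LoopOp>{}.emitWorkerClause();        // prints the "attach" line
  ClauseEmitter<KernelsLoopOp>{}.emitWorkerClause(); // prints the "forward" line
}

Because the branches of an `if constexpr` that are not taken are discarded at instantiation time, op types without an addEmptyWorker/addEmptyVector-style API never have to compile those calls, which is presumably why the emitter dispatches on the op type this way rather than with a runtime check.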