[Openmp-commits] [clang] [openmp] [OpenMP 6.0 ]Codegen for Reduction over private variables with reduction clause (PR #134709)
CHANDRA GHALE via Openmp-commits
openmp-commits at lists.llvm.org
Fri May 9 04:36:39 PDT 2025
https://github.com/chandraghale updated https://github.com/llvm/llvm-project/pull/134709
>From a05af192052de8503fb4945bfb853b3f2c14e4c9 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 7 Apr 2025 13:58:25 -0500
Subject: [PATCH 01/18] Codegen for Reduction over private variables with
reduction clause
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 147 +++++++++++
clang/lib/CodeGen/CGOpenMPRuntime.h | 14 ++
clang/lib/CodeGen/CGStmtOpenMP.cpp | 12 +-
.../OpenMP/for_private_reduction_codegen.cpp | 236 ++++++++++++++++++
4 files changed, 406 insertions(+), 3 deletions(-)
create mode 100644 clang/test/OpenMP/for_private_reduction_codegen.cpp
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 06a652c146fb9..3424227e5da79 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4899,6 +4899,150 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
}
}
+void CGOpenMPRuntime::emitPrivateReduction(
+ CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps) {
+
+ if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
+ return;
+
+ if (LHSExprs.size() != Privates.size() ||
+ LHSExprs.size() != ReductionOps.size())
+ return;
+
+ QualType PrivateType = Privates[0]->getType();
+ llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
+
+ BinaryOperatorKind MainBO = BO_Comma;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[0])) {
+ if (const auto *RHSExpr = BinOp->getRHS()) {
+ if (const auto *BORHS =
+ dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ MainBO = BORHS->getOpcode();
+ }
+ }
+ }
+
+ llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType);
+ const Expr *Private = Privates[0];
+
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Private)) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ if (const Expr *Init = VD->getInit()) {
+ if (Init->isConstantInitializer(CGF.getContext(), false)) {
+ Expr::EvalResult Result;
+ if (Init->EvaluateAsRValue(Result, CGF.getContext())) {
+ APValue &InitValue = Result.Val;
+ if (InitValue.isInt()) {
+ InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Create an internal shared variable
+ std::string SharedName = getName({"internal_private_var"});
+ llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable(
+ CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage,
+ InitVal, ".omp.reduction." + SharedName, nullptr,
+ llvm::GlobalVariable::NotThreadLocal);
+
+ SharedVar->setAlignment(
+ llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
+
+ Address SharedResult(SharedVar, SharedVar->getValueType(),
+ CGF.getContext().getTypeAlignInChars(PrivateType));
+
+ llvm::Value *ThreadId = getThreadID(CGF, Loc);
+ llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
+ llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
+ llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
+
+ llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ(
+ ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
+ CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
+
+ CGF.EmitBlock(InitBB);
+ CGF.Builder.CreateStore(InitVal, SharedResult);
+ CGF.Builder.CreateBr(InitEndBB);
+
+ CGF.EmitBlock(InitEndBB);
+
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ for (unsigned I = 0; I < ReductionOps.size(); ++I) {
+ if (I >= LHSExprs.size()) {
+ break;
+ }
+
+ const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]);
+ if (!BinOp || BinOp->getOpcode() != BO_Assign)
+ continue;
+
+ const Expr *RHSExpr = BinOp->getRHS();
+ if (!RHSExpr)
+ continue;
+
+ BinaryOperatorKind BO = BO_Comma;
+ if (const auto *BORHS =
+ dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ BO = BORHS->getOpcode();
+ }
+
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
+ auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) {
+ if (BO == BO_Mul) {
+ llvm::Value *OldScalar = OldVal.getScalarVal();
+ llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
+ llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar);
+ return RValue::get(Result);
+ } else {
+ OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
+ BinOp->getLHS()->getType(),
+ ExprValueKind::VK_PRValue);
+ CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal);
+ return CGF.EmitAnyExpr(BinOp->getRHS());
+ }
+ };
+
+ (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
+ SharedLV, PrivateRV, BO, true,
+ llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ }
+
+ // Final barrier
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+
+ // Broadcast final result
+ llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult);
+
+ // Update private variables with final result
+ for (unsigned I = 0; I < Privates.size(); ++I) {
+ LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress());
+ }
+
+ // Final synchronization
+ CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+ CGM.getModule(), OMPRTL___kmpc_barrier),
+ BarrierArgs);
+}
+
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
@@ -5201,6 +5345,9 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+ if (Options.IsPrivateVarReduction) {
+ emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
+ }
}
/// Generates unique name for artificial threadprivate variables.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 4321712e1521d..50ba28b565b6d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1201,8 +1201,22 @@ class CGOpenMPRuntime {
struct ReductionOptionsTy {
bool WithNowait;
bool SimpleReduction;
+ bool IsPrivateVarReduction;
OpenMPDirectiveKind ReductionKind;
};
+
+ /// Emits code for private variable reduction
+ /// \param Privates List of private copies for original reduction arguments.
+ /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+ /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+ /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+ /// or 'operator binop(LHS, RHS)'.
+ void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
+ ArrayRef<const Expr *> Privates,
+ ArrayRef<const Expr *> LHSExprs,
+ ArrayRef<const Expr *> RHSExprs,
+ ArrayRef<const Expr *> ReductionOps);
+
/// Emit a code for reduction clause. Next code should be emitted for
/// reduction:
/// \code
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index e4d1db264aac9..720a88e075ddd 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1470,6 +1470,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
llvm::SmallVector<const Expr *, 8> LHSExprs;
llvm::SmallVector<const Expr *, 8> RHSExprs;
llvm::SmallVector<const Expr *, 8> ReductionOps;
+ llvm::SmallVector<bool, 8> IsPrivate;
bool HasAtLeastOneReduction = false;
bool IsReductionWithTaskMod = false;
for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
@@ -1480,6 +1481,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
Privates.append(C->privates().begin(), C->privates().end());
LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+ IsPrivate.append(C->private_var_reduction_flags().begin(),
+ C->private_var_reduction_flags().end());
ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
IsReductionWithTaskMod =
IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
@@ -1499,9 +1502,11 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
bool SimpleReduction = ReductionKind == OMPD_simd;
// Emit nowait reduction if nowait clause is present or directive is a
// parallel directive (it always has implicit barrier).
+ bool IsPrivateVarReduction =
+ llvm::any_of(IsPrivate, [](bool IsPriv) { return IsPriv; });
CGM.getOpenMPRuntime().emitReduction(
*this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps,
- {WithNowait, SimpleReduction, ReductionKind});
+ {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind});
}
}
@@ -3943,7 +3948,8 @@ static void emitScanBasedDirective(
PrivScope.Privatize();
CGF.CGM.getOpenMPRuntime().emitReduction(
CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true,
+ /*IsPrivateVarReduction */ false, OMPD_unknown});
}
llvm::Value *NextIVal =
CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
@@ -5747,7 +5753,7 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
}
CGM.getOpenMPRuntime().emitReduction(
*this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
- {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd});
+ {/*WithNowait=*/true, /*SimpleReduction=*/true, false, OMPD_simd});
for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
const Expr *PrivateExpr = Privates[I];
LValue DestLVal;
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
new file mode 100644
index 0000000000000..be50991ca193e
--- /dev/null
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -0,0 +1,236 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --prefix-filecheck-ir-name _ --version 5
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#define N 10
+void do_red(int n, int *v, int &sum_v)
+ {
+ sum_v = 0;
+ #pragma omp for reduction(original(private),+: sum_v)
+ for (int i = 0; i < n; i++)
+ {
+ sum_v += v[i];
+ }
+ }
+ int main(void)
+ {
+ int v[N];
+ for (int i = 0; i < N; i++)
+ v[i] = i;
+ #pragma omp parallel num_threads(4)
+ {
+ int s_v;
+ do_red(N, v, s_v);
+ }
+ return 0;
+ }
+//.
+// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
+// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4
+// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+//.
+// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi(
+// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]]
+// CHECK: [[OMP_PRECOND_THEN]]:
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT: br i1 [[CMP7]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP10]], %[[COND_TRUE]] ], [ [[TMP11]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-NEXT: br i1 [[CMP8]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4
+// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 [[TMP29]] seq_cst, align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
+// CHECK: [[OMP_PRECOND_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func(
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define dso_local noundef i32 @main(
+// CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND:.*]]
+// CHECK: [[FOR_COND]]:
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK: [[FOR_BODY]]:
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: br label %[[FOR_INC:.*]]
+// CHECK: [[FOR_INC]]:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK: [[FOR_END]]:
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT: ret i32 0
+//
+//
+// CHECK-LABEL: define internal void @main.omp_outlined(
+// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[S_V:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0
+// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]])
+// CHECK-NEXT: ret void
>From 4e6eea6ff066320a584d606ab258c375b1d887be Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Tue, 8 Apr 2025 11:26:53 -0500
Subject: [PATCH 02/18] review comment changes incorporated
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3424227e5da79..13b070f898a1c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4981,7 +4981,8 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
- for (unsigned I = 0; I < ReductionOps.size(); ++I) {
+ for (unsigned I :
+ llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
if (I >= LHSExprs.size()) {
break;
}
@@ -5003,7 +5004,7 @@ void CGOpenMPRuntime::emitPrivateReduction(
LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
- auto &&UpdateOp = [&CGF, PrivateRV, BinOp, BO](RValue OldVal) {
+ auto UpdateOp = [&](RValue OldVal) {
if (BO == BO_Mul) {
llvm::Value *OldScalar = OldVal.getScalarVal();
llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
@@ -5032,7 +5033,7 @@ void CGOpenMPRuntime::emitPrivateReduction(
llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult);
// Update private variables with final result
- for (unsigned I = 0; I < Privates.size(); ++I) {
+ for (unsigned I : llvm::seq<unsigned>(Privates.size())) {
LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress());
}
@@ -5345,9 +5346,8 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
- if (Options.IsPrivateVarReduction) {
+ if (Options.IsPrivateVarReduction)
emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
- }
}
/// Generates unique name for artificial threadprivate variables.
>From 18e1708275ffbeec5a68b264f8c584b3c9a72704 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 9 Apr 2025 11:28:59 -0500
Subject: [PATCH 03/18] review comment , removing redundant code
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 13b070f898a1c..93d7280408002 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4983,9 +4983,6 @@ void CGOpenMPRuntime::emitPrivateReduction(
for (unsigned I :
llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
- if (I >= LHSExprs.size()) {
- break;
- }
const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]);
if (!BinOp || BinOp->getOpcode() != BO_Assign)
>From 59ab4be637f31f04e684fe759d287841c9a11746 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Thu, 10 Apr 2025 11:44:44 -0500
Subject: [PATCH 04/18] fix for user-defined reduction op
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 93d7280408002..3fd0a0489f38e 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4993,9 +4993,11 @@ void CGOpenMPRuntime::emitPrivateReduction(
continue;
BinaryOperatorKind BO = BO_Comma;
- if (const auto *BORHS =
- dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
+ const Expr *StripRHS = RHSExpr->IgnoreParenImpCasts();
+ if (const auto *BORHS = dyn_cast<BinaryOperator>(StripRHS)) {
BO = BORHS->getOpcode();
+ } else if (const auto *OpCall = dyn_cast<CXXOperatorCallExpr>(StripRHS)) {
+ BO = BinaryOperator::getOverloadedOpcode(OpCall->getOperator());
}
LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
>From e45c30a3e6489d2c40f4267c4444d557683be6a7 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Thu, 1 May 2025 06:10:20 -0500
Subject: [PATCH 05/18] Handle user-defined reduction and updated lit test
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 134 ++++---
.../OpenMP/for_private_reduction_codegen.cpp | 344 ++++++++++++++++--
2 files changed, 393 insertions(+), 85 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3fd0a0489f38e..bea9f6af080dd 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4914,30 +4914,22 @@ void CGOpenMPRuntime::emitPrivateReduction(
QualType PrivateType = Privates[0]->getType();
llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
- BinaryOperatorKind MainBO = BO_Comma;
- if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[0])) {
- if (const auto *RHSExpr = BinOp->getRHS()) {
- if (const auto *BORHS =
- dyn_cast<BinaryOperator>(RHSExpr->IgnoreParenImpCasts())) {
- MainBO = BORHS->getOpcode();
- }
- }
- }
-
llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType);
- const Expr *Private = Privates[0];
-
- if (const auto *DRE = dyn_cast<DeclRefExpr>(Private)) {
+ const Expr *InitExpr = nullptr;
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
- if (const Expr *Init = VD->getInit()) {
- if (Init->isConstantInitializer(CGF.getContext(), false)) {
+ InitExpr = VD->getInit();
+ if (InitExpr && !PrivateType->isAggregateType()) {
+ if (InitExpr->isConstantInitializer(CGF.getContext(), false)) {
Expr::EvalResult Result;
- if (Init->EvaluateAsRValue(Result, CGF.getContext())) {
+ if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) {
APValue &InitValue = Result.Val;
if (InitValue.isInt()) {
InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
}
}
+ } else {
+ InitVal = llvm::Constant::getNullValue(LLVMType);
}
}
}
@@ -4972,7 +4964,25 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
CGF.EmitBlock(InitBB);
- CGF.Builder.CreateStore(InitVal, SharedResult);
+ if (InitExpr) {
+ RValue RV = CGF.EmitAnyExpr(InitExpr);
+ if (RV.isAggregate()) {
+ CGF.Builder.CreateMemCpy(SharedResult, RV.getAggregateAddress(),
+ llvm::ConstantInt::get(CGF.IntPtrTy, 4),
+ /*IsVolatile=*/false);
+ } else {
+ CGF.Builder.CreateStore(RV.getScalarVal(), SharedResult);
+ }
+ } else {
+ if (PrivateType->isAggregateType()) {
+ CGF.Builder.CreateMemSet(SharedResult,
+ llvm::ConstantInt::get(CGM.Int8Ty, 0),
+ llvm::ConstantInt::get(CGF.IntPtrTy, 4),
+ /*IsVolatile=*/false);
+ } else {
+ CGF.Builder.CreateStore(InitVal, SharedResult);
+ }
+ }
CGF.Builder.CreateBr(InitEndBB);
CGF.EmitBlock(InitEndBB);
@@ -4983,46 +4993,75 @@ void CGOpenMPRuntime::emitPrivateReduction(
for (unsigned I :
llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
+ const Expr *ReductionClauseExpr = ReductionOps[I]->IgnoreParenCasts();
+ if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
+ ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
+ const Expr *AssignRHS = nullptr;
+ const Expr *AssignLHS = nullptr;
+
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
+ if (BinOp->getOpcode() == BO_Assign) {
+ AssignLHS = BinOp->getLHS();
+ AssignRHS = BinOp->getRHS();
+ }
+ } else if (const auto *OpCall =
+ dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+ if (OpCall->getOperator() == OO_Equal) {
+ AssignLHS = OpCall->getArg(0);
+ AssignRHS = OpCall->getArg(1);
+ }
+ }
- const auto *BinOp = dyn_cast<BinaryOperator>(ReductionOps[I]);
- if (!BinOp || BinOp->getOpcode() != BO_Assign)
- continue;
-
- const Expr *RHSExpr = BinOp->getRHS();
- if (!RHSExpr)
+ if (!AssignRHS || !AssignLHS) {
continue;
+ }
- BinaryOperatorKind BO = BO_Comma;
- const Expr *StripRHS = RHSExpr->IgnoreParenImpCasts();
- if (const auto *BORHS = dyn_cast<BinaryOperator>(StripRHS)) {
- BO = BORHS->getOpcode();
- } else if (const auto *OpCall = dyn_cast<CXXOperatorCallExpr>(StripRHS)) {
- BO = BinaryOperator::getOverloadedOpcode(OpCall->getOperator());
+ const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts();
+ if (const auto *MTE =
+ dyn_cast<MaterializeTemporaryExpr>(ReductionCombinerExpr)) {
+ ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
}
+ BinaryOperatorKind BO = BO_Assign;
LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
- auto UpdateOp = [&](RValue OldVal) {
- if (BO == BO_Mul) {
- llvm::Value *OldScalar = OldVal.getScalarVal();
- llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
- llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar);
- return RValue::get(Result);
- } else {
- OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
- BinOp->getLHS()->getType(),
- ExprValueKind::VK_PRValue);
- CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal);
- return CGF.EmitAnyExpr(BinOp->getRHS());
- }
- };
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionCombinerExpr)) {
+ BO = BinOp->getOpcode();
+ auto UpdateOp = [&](RValue OldVal) {
+ if (BO == BO_Mul) {
+ llvm::Value *OldScalar = OldVal.getScalarVal();
+ llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
+ llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar);
+ return RValue::get(Result);
+ } else {
+ OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
+ BinOp->getLHS()->getType(),
+ ExprValueKind::VK_PRValue);
+ CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal);
+ return CGF.EmitAnyExpr(BinOp->getRHS());
+ }
+ };
- (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
- SharedLV, PrivateRV, BO, true,
- llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
+ SharedLV, PrivateRV, BO, true,
+ llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ } else if (const auto *OpCall = dyn_cast<CallExpr>(ReductionClauseExpr)) {
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ CharUnits Alignment = CGF.getContext().getTypeAlignInChars(PrivateType);
+ Address TempResult =
+ CGF.CreateMemTemp(PrivateType, "reduction.temp.result");
+ ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false);
+ RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr);
+ CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(),
+ llvm::ConstantInt::get(CGF.IntPtrTy, 4),
+ Alignment.getQuantity());
+ };
+ std::string CriticalName = getName({"reduction_critical"});
+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+ }
}
-
// Final barrier
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
@@ -5042,7 +5081,6 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
}
-
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index be50991ca193e..dcacc4140bbdb 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -2,38 +2,307 @@
// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s
// expected-no-diagnostics
#define N 10
+class Sum {
+ int val;
+public:
+ Sum(int v = 0) : val(v) {}
+ Sum operator+(const Sum& rhs) const {
+ return Sum(val + rhs.val);
+ }
+};
+
+void func_red(){
+ Sum result(0);
+ Sum array[N];
+
+ for(int i = 0; i < 10; i++) {
+ array[i] = Sum(i);
+ }
+
+ #pragma omp parallel private(result) num_threads(4)
+ {
+ #pragma omp for reduction(+:result)
+ for(int i = 0; i < 10; i++) {
+ result = result + array[i];
+ }
+ }
+}
+
void do_red(int n, int *v, int &sum_v)
{
- sum_v = 0;
- #pragma omp for reduction(original(private),+: sum_v)
- for (int i = 0; i < n; i++)
- {
- sum_v += v[i];
- }
+ sum_v = 0;
+ #pragma omp for reduction(original(private),+: sum_v)
+ for (int i = 0; i < n; i++)
+ {
+ sum_v += v[i];
+ }
}
int main(void)
{
- int v[N];
- for (int i = 0; i < N; i++)
- v[i] = i;
- #pragma omp parallel num_threads(4)
- {
- int s_v;
- do_red(N, v, s_v);
- }
- return 0;
+ int v[N];
+ for (int i = 0; i < N; i++)
+ v[i] = i;
+ #pragma omp parallel num_threads(4)
+ {
+ int s_v;
+ do_red(N, v, s_v);
+ }
+ return 0;
}
//.
// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
-// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.gomp_critical_user_.reduction.var = common global [8 x i32] zeroinitializer, align 8
-// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
-// CHECK: @.omp.reduction..internal_private_var = common global i32 0, align 4
+// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8
+// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4
+// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
+// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4
//.
+// CHECK-LABEL: define dso_local void @_Z8func_redv(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*]]:
+// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10
+// CHECK-NEXT: br label %[[ARRAYCTOR_LOOP:.*]]
+// CHECK: [[ARRAYCTOR_LOOP]]:
+// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], %[[ENTRY]] ], [ [[ARRAYCTOR_NEXT:%.*]], %[[ARRAYCTOR_LOOP]] ]
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0)
+// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label %[[ARRAYCTOR_CONT:.*]], label %[[ARRAYCTOR_LOOP]]
+// CHECK: [[ARRAYCTOR_CONT]]:
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND:.*]]
+// CHECK: [[FOR_COND]]:
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY:.*]], label %[[FOR_END:.*]]
+// CHECK: [[FOR_BODY]]:
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]])
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT: br label %[[FOR_INC:.*]]
+// CHECK: [[FOR_INC]]:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK: [[FOR_END]]:
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr void @_ZN3SumC1Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined(
+// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[ARRAY_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
+// CHECK-NEXT: [[REF_TMP4:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[REF_TMP7:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[REDUCTION_TEMP_RESULT:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: [[REF_TMP10:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], i32 noundef 0)
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-NEXT: br i1 [[CMP]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[TMP4]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-NEXT: br i1 [[CMP2]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]])
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[RESULT1]], ptr [[TMP11]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP2]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP12]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: [[CALL5:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]])
+// CHECK-NEXT: [[COERCE_DIVE6:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP4]], i32 0, i32 0
+// CHECK-NEXT: store i32 [[CALL5]], ptr [[COERCE_DIVE6]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP4]], i64 4, i1 false)
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var)
+// CHECK-NEXT: [[CALL8:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]])
+// CHECK-NEXT: [[COERCE_DIVE9:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP7]], i32 0, i32 0
+// CHECK-NEXT: store i32 [[CALL8]], ptr [[COERCE_DIVE9]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP7]], i64 4, i1 false)
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var)
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0
+// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[AGG_TEMP]], i32 noundef 0)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 4 [[AGG_TEMP]], i64 4, i1 false)
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr [[RESULT]], align 4
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[CALL11:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]])
+// CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0
+// CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false)
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 16 [[RESULT]], i64 4, i1 true)
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: store [[CLASS_SUM]] [[TMP15]], ptr [[RESULT]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]])
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT: ret i32 [[TMP3]]
+//
+//
+// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func(
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP5]])
+// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr void @_ZN3SumC2Ei(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4
+// CHECK-NEXT: ret void
+//
+//
// CHECK-LABEL: define dso_local void @_Z6do_rediPiRi(
-// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
@@ -52,7 +321,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
@@ -130,7 +399,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[TMP22]], align 8
-// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB3]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: [[TMP23:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 1, i64 8, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: switch i32 [[TMP23]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
@@ -140,28 +409,28 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
// CHECK-NEXT: store i32 [[ADD11]], ptr [[TMP7]], align 4
-// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[SUM_V4]], align 4
// CHECK-NEXT: [[TMP27:%.*]] = atomicrmw add ptr [[TMP7]], i32 [[TMP26]] monotonic, align 4
-// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var.1, align 4
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4
-// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var, i32 [[TMP29]] seq_cst, align 4
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
-// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var.1, i32 [[TMP29]] seq_cst, align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4
// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB3]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
// CHECK: [[OMP_PRECOND_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
@@ -169,7 +438,7 @@ void do_red(int n, int *v, int &sum_v)
//
//
// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func(
-// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
@@ -189,12 +458,12 @@ void do_red(int n, int *v, int &sum_v)
//
//
// CHECK-LABEL: define dso_local noundef i32 @main(
-// CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+// CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
-// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB2]])
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
// CHECK-NEXT: store i32 0, ptr [[I]], align 4
// CHECK-NEXT: br label %[[FOR_COND:.*]]
@@ -213,15 +482,15 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4
// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1
// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4
-// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK-NEXT: br label %[[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
// CHECK: [[FOR_END]]:
-// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB2]], i32 [[TMP0]], i32 4)
-// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB2]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
// CHECK-NEXT: ret i32 0
//
//
// CHECK-LABEL: define internal void @main.omp_outlined(
-// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-SAME: ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[V:%.*]]) #[[ATTR2]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -234,3 +503,4 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0
// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]])
// CHECK-NEXT: ret void
+
>From 980bc06bd8bb1a733f3df4a4b33629105058fd59 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Thu, 1 May 2025 12:54:30 -0500
Subject: [PATCH 06/18] conditional checks
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index bea9f6af080dd..3627ba30cf8d7 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -5054,9 +5054,15 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.CreateMemTemp(PrivateType, "reduction.temp.result");
ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false);
RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr);
- CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(),
- llvm::ConstantInt::get(CGF.IntPtrTy, 4),
- Alignment.getQuantity());
+ if (ResultRV.isAggregate()) {
+ CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(),
+ llvm::ConstantInt::get(CGF.IntPtrTy, 4),
+ Alignment.getQuantity());
+ } else {
+ CGF.Builder.CreateStore(ResultRV.getScalarVal(),
+ SharedLV.getAddress(),
+ /*IsVolatile=*/false);
+ }
};
std::string CriticalName = getName({"reduction_critical"});
emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
>From a103dfa2cb7f9176fea411e15801e6b45d4271ac Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Thu, 1 May 2025 13:01:00 -0500
Subject: [PATCH 07/18] lit update
---
clang/test/OpenMP/for_private_reduction_codegen.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index dcacc4140bbdb..4995ac42a2242 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -49,6 +49,7 @@ void do_red(int n, int *v, int &sum_v)
}
return 0;
}
+
//.
// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
@@ -235,7 +236,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0
// CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4
// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false)
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 16 [[RESULT]], i64 4, i1 true)
+// CHECK-NEXT: store ptr [[RESULT]], ptr @.omp.reduction..internal_private_var, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4
>From 526314c4210b7c09ccba882850e209ed756d32b5 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 5 May 2025 10:48:25 -0500
Subject: [PATCH 08/18] Support for UDR for private variables
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 221 ++++++++++--------
.../OpenMP/for_private_reduction_codegen.cpp | 101 +++++---
2 files changed, 190 insertions(+), 132 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 3627ba30cf8d7..71742d24a33ba 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4903,7 +4903,6 @@ void CGOpenMPRuntime::emitPrivateReduction(
CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
ArrayRef<const Expr *> ReductionOps) {
-
if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
return;
@@ -4914,13 +4913,16 @@ void CGOpenMPRuntime::emitPrivateReduction(
QualType PrivateType = Privates[0]->getType();
llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
- llvm::Constant *InitVal = llvm::Constant::getNullValue(LLVMType);
- const Expr *InitExpr = nullptr;
- if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
- if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
- InitExpr = VD->getInit();
- if (InitExpr && !PrivateType->isAggregateType()) {
- if (InitExpr->isConstantInitializer(CGF.getContext(), false)) {
+ llvm::Constant *InitVal = nullptr;
+ const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]);
+
+ if (!UDR) {
+ InitVal = llvm::Constant::getNullValue(LLVMType);
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ const Expr *InitExpr = VD->getInit();
+ if (InitExpr && !PrivateType->isAggregateType() &&
+ !PrivateType->isAnyComplexType()) {
Expr::EvalResult Result;
if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) {
APValue &InitValue = Result.Val;
@@ -4928,11 +4930,11 @@ void CGOpenMPRuntime::emitPrivateReduction(
InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
}
}
- } else {
- InitVal = llvm::Constant::getNullValue(LLVMType);
}
}
}
+ } else {
+ InitVal = llvm::Constant::getNullValue(LLVMType);
}
// Create an internal shared variable
@@ -4964,108 +4966,121 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
CGF.EmitBlock(InitBB);
- if (InitExpr) {
- RValue RV = CGF.EmitAnyExpr(InitExpr);
- if (RV.isAggregate()) {
- CGF.Builder.CreateMemCpy(SharedResult, RV.getAggregateAddress(),
- llvm::ConstantInt::get(CGF.IntPtrTy, 4),
- /*IsVolatile=*/false);
- } else {
- CGF.Builder.CreateStore(RV.getScalarVal(), SharedResult);
- }
+ if (UDR) {
+ Address OrigAddr = Address::invalid();
+ emitInitWithReductionInitializer(CGF, UDR, ReductionOps[0], SharedResult,
+ OrigAddr, PrivateType);
+
} else {
- if (PrivateType->isAggregateType()) {
- CGF.Builder.CreateMemSet(SharedResult,
- llvm::ConstantInt::get(CGM.Int8Ty, 0),
- llvm::ConstantInt::get(CGF.IntPtrTy, 4),
- /*IsVolatile=*/false);
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ const Expr *InitExpr = VD->getInit();
+ if (InitExpr && (PrivateType->isAggregateType() ||
+ PrivateType->isAnyComplexType())) {
+ CGF.EmitAnyExprToMem(InitExpr, SharedResult,
+ PrivateType.getQualifiers(),
+ /*IsInitializer=*/true);
+ } else if (!InitVal->isNullValue()) {
+ CGF.EmitStoreOfScalar(InitVal,
+ CGF.MakeAddrLValue(SharedResult, PrivateType));
+ } else {
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
+ }
+ } else {
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
+ }
} else {
- CGF.Builder.CreateStore(InitVal, SharedResult);
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
}
}
- CGF.Builder.CreateBr(InitEndBB);
+ CGF.Builder.CreateBr(InitEndBB);
CGF.EmitBlock(InitEndBB);
-
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
-
for (unsigned I :
llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
- const Expr *ReductionClauseExpr = ReductionOps[I]->IgnoreParenCasts();
- if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
- ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
- const Expr *AssignRHS = nullptr;
- const Expr *AssignLHS = nullptr;
-
- if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
- if (BinOp->getOpcode() == BO_Assign) {
- AssignLHS = BinOp->getLHS();
- AssignRHS = BinOp->getRHS();
- }
- } else if (const auto *OpCall =
- dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
- if (OpCall->getOperator() == OO_Equal) {
- AssignLHS = OpCall->getArg(0);
- AssignRHS = OpCall->getArg(1);
- }
- }
-
- if (!AssignRHS || !AssignLHS) {
- continue;
- }
- const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts();
- if (const auto *MTE =
- dyn_cast<MaterializeTemporaryExpr>(ReductionCombinerExpr)) {
- ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
- }
-
- BinaryOperatorKind BO = BO_Assign;
+ const Expr *ReductionOp = ReductionOps[I];
LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
- RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
- if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionCombinerExpr)) {
- BO = BinOp->getOpcode();
- auto UpdateOp = [&](RValue OldVal) {
- if (BO == BO_Mul) {
- llvm::Value *OldScalar = OldVal.getScalarVal();
- llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
- llvm::Value *Result = CGF.Builder.CreateMul(OldScalar, PrivateScalar);
- return RValue::get(Result);
- } else {
- OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
- BinOp->getLHS()->getType(),
- ExprValueKind::VK_PRValue);
- CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE, OldVal);
- return CGF.EmitAnyExpr(BinOp->getRHS());
- }
- };
-
- (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
- SharedLV, PrivateRV, BO, true,
- llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
- } else if (const auto *OpCall = dyn_cast<CallExpr>(ReductionClauseExpr)) {
+ // If UDR
+ const OMPDeclareReductionDecl *CurrentUDR =
+ getReductionInit(ReductionOps[I]);
+ if (CurrentUDR) {
auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
- CharUnits Alignment = CGF.getContext().getTypeAlignInChars(PrivateType);
- Address TempResult =
- CGF.CreateMemTemp(PrivateType, "reduction.temp.result");
- ReturnValueSlot RVS(TempResult, /*IsVolatile=*/false);
- RValue ResultRV = CGF.EmitCallExpr(OpCall, RVS, nullptr);
- if (ResultRV.isAggregate()) {
- CGF.Builder.CreateMemCpy(SharedResult, ResultRV.getAggregateAddress(),
- llvm::ConstantInt::get(CGF.IntPtrTy, 4),
- Alignment.getQuantity());
- } else {
- CGF.Builder.CreateStore(ResultRV.getScalarVal(),
- SharedLV.getAddress(),
- /*IsVolatile=*/false);
+ std::pair<llvm::Function *, llvm::Function *> ReductionFnPair =
+ getUserDefinedReduction(CurrentUDR);
+ llvm::Function *CombinerFn = ReductionFnPair.first;
+ if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
+ if (CE && CombinerFn) {
+ const auto *CE = cast<CallExpr>(ReductionOps[I]);
+ const auto *OutDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ const auto *InDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
+ SharedLV.getAddress());
+ LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
+ LHSLV.getAddress());
+ (void)LocalScope.Privatize();
+ emitReductionCombiner(CGF, ReductionOp);
+ }
}
};
std::string CriticalName = getName({"reduction_critical"});
emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+ } else {
+ // Built-in Operator Combination
+ const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
+ if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
+ ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
+ const Expr *AssignRHS = nullptr;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
+ if (BinOp->getOpcode() == BO_Assign)
+ AssignRHS = BinOp->getRHS();
+ } else if (const auto *OpCall =
+ dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+ if (OpCall->getOperator() == OO_Equal)
+ AssignRHS = OpCall->getArg(1);
+ }
+ if (!AssignRHS)
+ continue;
+ const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts();
+ if (const auto *MTE =
+ dyn_cast<MaterializeTemporaryExpr>(ReductionCombinerExpr))
+ ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
+
+ BinaryOperatorKind BO = BO_Assign;
+ RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionCombinerExpr)) {
+ BO = BinOp->getOpcode();
+ auto UpdateOp = [&](RValue OldVal) {
+ if (BO == BO_Mul) {
+ llvm::Value *OldScalar = OldVal.getScalarVal();
+ llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
+ llvm::Value *Result =
+ CGF.Builder.CreateMul(OldScalar, PrivateScalar);
+ return RValue::get(Result);
+ } else {
+ OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
+ BinOp->getLHS()->getType(),
+ ExprValueKind::VK_PRValue);
+ CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE,
+ OldVal);
+ return CGF.EmitAnyExpr(BinOp->getRHS());
+ }
+ };
+
+ (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
+ SharedLV, PrivateRV, BO, true,
+ llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ }
}
}
// Final barrier
@@ -5074,12 +5089,25 @@ void CGOpenMPRuntime::emitPrivateReduction(
BarrierArgs);
// Broadcast final result
- llvm::Value *FinalResult = CGF.Builder.CreateLoad(SharedResult);
+ bool IsAggregate = PrivateType->isAggregateType();
+ llvm::Value *FinalResultVal = nullptr;
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ Address FinalResultAddr = Address::invalid();
+ if (IsAggregate) {
+ FinalResultAddr = SharedResult;
+ } else {
+ FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc);
+ }
- // Update private variables with final result
for (unsigned I : llvm::seq<unsigned>(Privates.size())) {
- LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
- CGF.Builder.CreateStore(FinalResult, LHSLV.getAddress());
+ LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]);
+ if (IsAggregate) {
+ CGF.EmitAggregateCopy(TargetLHSLV,
+ CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
+ PrivateType, AggValueSlot::DoesNotOverlap, false);
+ } else {
+ CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
+ }
}
// Final synchronization
@@ -5087,6 +5115,7 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
}
+
void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
ArrayRef<const Expr *> Privates,
ArrayRef<const Expr *> LHSExprs,
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index 4995ac42a2242..eeba6c29afb54 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -9,20 +9,25 @@ class Sum {
Sum operator+(const Sum& rhs) const {
return Sum(val + rhs.val);
}
+ Sum& operator+=(const Sum& rhs) {
+ val += rhs.val;
+ return *this;
+ }
};
+#pragma omp declare reduction(sum_reduction : Sum : omp_out += omp_in) initializer(omp_priv = Sum(0))
void func_red(){
Sum result(0);
Sum array[N];
- for(int i = 0; i < 10; i++) {
+ for (int i = 0; i < N; i++) {
array[i] = Sum(i);
}
#pragma omp parallel private(result) num_threads(4)
{
- #pragma omp for reduction(+:result)
- for(int i = 0; i < 10; i++) {
+ #pragma omp for reduction(sum_reduction:result)
+ for (int i = 0; i < N; i++) {
result = result + array[i];
}
}
@@ -58,6 +63,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4
+// CHECK: @_ZZ8func_redvE6result = internal global %class.Sum zeroinitializer, align 4
// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4
@@ -136,11 +142,6 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x ptr], align 8
-// CHECK-NEXT: [[REF_TMP4:%.*]] = alloca [[CLASS_SUM]], align 4
-// CHECK-NEXT: [[REF_TMP7:%.*]] = alloca [[CLASS_SUM]], align 4
-// CHECK-NEXT: [[AGG_TEMP:%.*]] = alloca [[CLASS_SUM]], align 4
-// CHECK-NEXT: [[REDUCTION_TEMP_RESULT:%.*]] = alloca [[CLASS_SUM]], align 4
-// CHECK-NEXT: [[REF_TMP10:%.*]] = alloca [[CLASS_SUM]], align 4
// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8
@@ -150,7 +151,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4
// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
-// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], i32 noundef 0)
+// CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]])
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
@@ -205,18 +206,12 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: i32 2, [[DOTOMP_REDUCTION_CASE2:label %.*]]
// CHECK-NEXT: ]
// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
-// CHECK-NEXT: [[CALL5:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]])
-// CHECK-NEXT: [[COERCE_DIVE6:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP4]], i32 0, i32 0
-// CHECK-NEXT: store i32 [[CALL5]], ptr [[COERCE_DIVE6]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP4]], i64 4, i1 false)
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var)
-// CHECK-NEXT: [[CALL8:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]])
-// CHECK-NEXT: [[COERCE_DIVE9:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP7]], i32 0, i32 0
-// CHECK-NEXT: store i32 [[CALL8]], ptr [[COERCE_DIVE9]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP7]], i64 4, i1 false)
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.atomic_reduction.var)
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
@@ -225,27 +220,65 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0
// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[AGG_TEMP]], i32 noundef 0)
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, ptr align 4 [[AGG_TEMP]], i64 4, i1 false)
+// CHECK-NEXT: call void @.omp_initializer.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef @_ZZ8func_redvE6result)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr [[RESULT]], align 4
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
-// CHECK-NEXT: [[CALL11:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]])
-// CHECK-NEXT: [[COERCE_DIVE12:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP10]], i32 0, i32 0
-// CHECK-NEXT: store i32 [[CALL11]], ptr [[COERCE_DIVE12]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT]], ptr align 4 [[REF_TMP10]], i64 4, i1 false)
-// CHECK-NEXT: store ptr [[RESULT]], ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef [[RESULT]])
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-NEXT: [[TMP15:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4
-// CHECK-NEXT: store [[CLASS_SUM]] [[TMP15]], ptr [[RESULT]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: store [[CLASS_SUM]] [[TMP14]], ptr [[RESULT]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
// CHECK-NEXT: ret void
//
//
+// CHECK-LABEL: define internal void @.omp_combiner.(
+// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define linkonce_odr noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(
+// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[VAL2]], align 4
+// CHECK-NEXT: ret ptr [[THIS1]]
+//
+//
+// CHECK-LABEL: define internal void @.omp_initializer.(
+// CHECK-SAME: ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0)
+// CHECK-NEXT: ret void
+//
+//
// CHECK-LABEL: define linkonce_odr i32 @_ZNK3SumplERKS_(
// CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
// CHECK-NEXT: [[ENTRY:.*:]]
@@ -268,11 +301,10 @@ void do_red(int n, int *v, int &sum_v)
//
//
// CHECK-LABEL: define internal void @_Z8func_redv.omp_outlined.omp.reduction.reduction_func(
-// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
-// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
@@ -281,10 +313,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x ptr], ptr [[TMP2]], i64 0, i64 0
// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
-// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP5]])
-// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
-// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
-// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP7]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[TMP7]], ptr noundef [[TMP5]])
// CHECK-NEXT: ret void
//
//
@@ -422,7 +451,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.1, i8 0, i64 4, i1 false)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
@@ -439,7 +468,7 @@ void do_red(int n, int *v, int &sum_v)
//
//
// CHECK-LABEL: define internal void @_Z6do_rediPiRi.omp.reduction.reduction_func(
-// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR4]] {
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
@@ -459,7 +488,7 @@ void do_red(int n, int *v, int &sum_v)
//
//
// CHECK-LABEL: define dso_local noundef i32 @main(
-// CHECK-SAME: ) #[[ATTR6:[0-9]+]] {
+// CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
>From c77fb0e1911428ee9441f2f3c7a0bf02157a49e4 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Mon, 5 May 2025 12:44:45 -0500
Subject: [PATCH 09/18] Implicit reduction identifier fix
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 53 ++++++++++++-------
.../OpenMP/for_private_reduction_codegen.cpp | 25 +++++++--
2 files changed, 56 insertions(+), 22 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 71742d24a33ba..fb865ac9f453c 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4966,32 +4966,25 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB);
CGF.EmitBlock(InitBB);
- if (UDR) {
- Address OrigAddr = Address::invalid();
- emitInitWithReductionInitializer(CGF, UDR, ReductionOps[0], SharedResult,
- OrigAddr, PrivateType);
- } else {
- if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
- if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
- const Expr *InitExpr = VD->getInit();
- if (InitExpr && (PrivateType->isAggregateType() ||
- PrivateType->isAnyComplexType())) {
- CGF.EmitAnyExprToMem(InitExpr, SharedResult,
- PrivateType.getQualifiers(),
- /*IsInitializer=*/true);
- } else if (!InitVal->isNullValue()) {
- CGF.EmitStoreOfScalar(InitVal,
- CGF.MakeAddrLValue(SharedResult, PrivateType));
- } else {
- CGF.EmitNullInitialization(SharedResult, PrivateType);
- }
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ const Expr *InitExpr = VD->getInit();
+ if (InitExpr &&
+ (PrivateType->isAggregateType() || PrivateType->isAnyComplexType())) {
+ CGF.EmitAnyExprToMem(InitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ } else if (!InitVal->isNullValue()) {
+ CGF.EmitStoreOfScalar(InitVal,
+ CGF.MakeAddrLValue(SharedResult, PrivateType));
} else {
CGF.EmitNullInitialization(SharedResult, PrivateType);
}
} else {
CGF.EmitNullInitialization(SharedResult, PrivateType);
}
+ } else {
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
}
CGF.Builder.CreateBr(InitEndBB);
@@ -5080,6 +5073,28 @@ void CGOpenMPRuntime::emitPrivateReduction(
(void)CGF.EmitOMPAtomicSimpleUpdateExpr(
SharedLV, PrivateRV, BO, true,
llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
+ } else if (dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+ // Implicit Reduction Identifiers ( openmp 6.0 sec 7.6.5 )
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ const auto *OmpOutDRE =
+ dyn_cast<DeclRefExpr>(LHSExprs[I]->IgnoreParenImpCasts());
+ const auto *OmpInDRE =
+ dyn_cast<DeclRefExpr>(RHSExprs[I]->IgnoreParenImpCasts());
+
+ if (!OmpOutDRE || !OmpInDRE) {
+ return;
+ }
+ const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
+ const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
+ LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
+ (void)LocalScope.Privatize();
+ CGF.EmitIgnoredExpr(ReductionOp);
+ };
+ std::string CriticalName = getName({"reduction_critical"});
+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
}
}
}
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index eeba6c29afb54..313d0997f18e3 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -63,7 +63,6 @@ void do_red(int n, int *v, int &sum_v)
// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4
-// CHECK: @_ZZ8func_redvE6result = internal global %class.Sum zeroinitializer, align 4
// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4
@@ -220,7 +219,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0
// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: call void @.omp_initializer.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef @_ZZ8func_redvE6result)
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, i8 0, i64 4, i1 false)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
@@ -533,4 +532,24 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0
// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]])
// CHECK-NEXT: ret void
-
+//
+//.
+// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+// CHECK: attributes #[[ATTR2]] = { noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind }
+// CHECK: attributes #[[ATTR5]] = { noinline norecurse nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+// CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind }
+// CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
+// CHECK: attributes #[[ATTR8]] = { mustprogress noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
+//.
+// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+// CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 60}
+// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"}
+// CHECK: [[META5:![0-9]+]] = !{[[META6:![0-9]+]]}
+// CHECK: [[META6]] = !{i64 2, i64 -1, i64 -1, i1 true}
+// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
+//.
>From f202eaaefe3d4c905a946b1ead636defe6e0d7cf Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 7 May 2025 07:03:01 -0500
Subject: [PATCH 10/18] updated with comments, unified logic and docs
---
clang/docs/OpenMPSupport.rst | 4 +-
clang/docs/ReleaseNotes.rst | 1 +
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 206 +++++++++---------
.../OpenMP/for_private_reduction_codegen.cpp | 31 +--
4 files changed, 112 insertions(+), 130 deletions(-)
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 83d90ffef6bc7..3c5011a8809d5 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -406,7 +406,9 @@ implementation.
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Private reductions | :part:`partial` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 |
+| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938
+| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709
+|
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 159991e8db981..67d56b9cb5ec9 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -530,6 +530,7 @@ OpenMP Support
- Added support 'no_openmp_constructs' assumption clause.
- Added support for 'self_maps' in map and requirement clause.
- Added support for 'omp stripe' directive.
+- Added support for reduction over private variable with 'reduction' clause.
Improvements
^^^^^^^^^^^^
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index fb865ac9f453c..982501f7214a9 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4910,12 +4910,33 @@ void CGOpenMPRuntime::emitPrivateReduction(
LHSExprs.size() != ReductionOps.size())
return;
+ // Create a shared global variable (__shared_reduction_var) to accumulate the
+ // final result.
+ //
+ // Call __kmpc_barrier to synchronize threads before initialization.
+ //
+ // The master thread (thread_id == 0) initializes __shared_reduction_var
+ // with the identity value or initializer.
+ //
+ // Call __kmpc_barrier to synchronize before combining.
+ // For each i:
+ // - Thread enters critical section.
+ // - Reads its private value from LHSExprs[i].
+ // - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i],
+ // LHSExprs[i]).
+ // - Exits critical section.
+ //
+ // Call __kmpc_barrier after combining.
+ //
+ // Each thread copies __shared_reduction_var[i] back to LHSExprs[i].
+ //
+ // Final __kmpc_barrier to synchronize after broadcasting
QualType PrivateType = Privates[0]->getType();
llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
llvm::Constant *InitVal = nullptr;
const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]);
-
+ // Determine the initial value for the shared reduction variable
if (!UDR) {
InitVal = llvm::Constant::getNullValue(LLVMType);
if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
@@ -4926,9 +4947,8 @@ void CGOpenMPRuntime::emitPrivateReduction(
Expr::EvalResult Result;
if (InitExpr->EvaluateAsRValue(Result, CGF.getContext())) {
APValue &InitValue = Result.Val;
- if (InitValue.isInt()) {
+ if (InitValue.isInt())
InitVal = llvm::ConstantInt::get(LLVMType, InitValue.getInt());
- }
}
}
}
@@ -4954,10 +4974,11 @@ void CGOpenMPRuntime::emitPrivateReduction(
llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+ // First barrier to ensure all threads are ready.
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
-
+ // Initialize the shared variable by the master thread.
llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
@@ -4967,28 +4988,29 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.EmitBlock(InitBB);
- if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
- if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
- const Expr *InitExpr = VD->getInit();
- if (InitExpr &&
- (PrivateType->isAggregateType() || PrivateType->isAnyComplexType())) {
- CGF.EmitAnyExprToMem(InitExpr, SharedResult,
- PrivateType.getQualifiers(), true);
- } else if (!InitVal->isNullValue()) {
- CGF.EmitStoreOfScalar(InitVal,
- CGF.MakeAddrLValue(SharedResult, PrivateType));
- } else {
- CGF.EmitNullInitialization(SharedResult, PrivateType);
+ auto EmitSharedInit = [&]() {
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+ const Expr *InitExpr = VD->getInit();
+ if (InitExpr && (PrivateType->isAggregateType() ||
+ PrivateType->isAnyComplexType())) {
+ CGF.EmitAnyExprToMem(InitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ return;
+ }
+ if (!InitVal->isNullValue()) {
+ CGF.EmitStoreOfScalar(InitVal,
+ CGF.MakeAddrLValue(SharedResult, PrivateType));
+ return;
+ }
}
- } else {
- CGF.EmitNullInitialization(SharedResult, PrivateType);
}
- } else {
CGF.EmitNullInitialization(SharedResult, PrivateType);
- }
-
+ };
+ EmitSharedInit();
CGF.Builder.CreateBr(InitEndBB);
CGF.EmitBlock(InitEndBB);
+
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
@@ -4996,20 +5018,23 @@ void CGOpenMPRuntime::emitPrivateReduction(
llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
const Expr *ReductionOp = ReductionOps[I];
+ const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp);
LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
- // If UDR
- const OMPDeclareReductionDecl *CurrentUDR =
- getReductionInit(ReductionOps[I]);
+
+ auto EmitCriticalReduction = [&](auto ReductionGen) {
+ std::string CriticalName = getName({"reduction_critical"});
+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+ };
+
if (CurrentUDR) {
+ // Handle user-defined reduction.
auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
- std::pair<llvm::Function *, llvm::Function *> ReductionFnPair =
+ std::pair<llvm::Function *, llvm::Function *> FnPair =
getUserDefinedReduction(CurrentUDR);
- llvm::Function *CombinerFn = ReductionFnPair.first;
- if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
- if (CE && CombinerFn) {
- const auto *CE = cast<CallExpr>(ReductionOps[I]);
+ if (FnPair.first) {
+ if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
const auto *OutDRE = cast<DeclRefExpr>(
cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
->getSubExpr());
@@ -5026,93 +5051,65 @@ void CGOpenMPRuntime::emitPrivateReduction(
}
}
};
- std::string CriticalName = getName({"reduction_critical"});
- emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
- } else {
- // Built-in Operator Combination
- const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
- if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
- ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
- const Expr *AssignRHS = nullptr;
- if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
- if (BinOp->getOpcode() == BO_Assign)
- AssignRHS = BinOp->getRHS();
- } else if (const auto *OpCall =
- dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
- if (OpCall->getOperator() == OO_Equal)
- AssignRHS = OpCall->getArg(1);
- }
- if (!AssignRHS)
- continue;
- const Expr *ReductionCombinerExpr = AssignRHS->IgnoreParenImpCasts();
- if (const auto *MTE =
- dyn_cast<MaterializeTemporaryExpr>(ReductionCombinerExpr))
- ReductionCombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
-
- BinaryOperatorKind BO = BO_Assign;
- RValue PrivateRV = CGF.EmitLoadOfLValue(LHSLV, Loc);
- if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionCombinerExpr)) {
- BO = BinOp->getOpcode();
- auto UpdateOp = [&](RValue OldVal) {
- if (BO == BO_Mul) {
- llvm::Value *OldScalar = OldVal.getScalarVal();
- llvm::Value *PrivateScalar = PrivateRV.getScalarVal();
- llvm::Value *Result =
- CGF.Builder.CreateMul(OldScalar, PrivateScalar);
- return RValue::get(Result);
- } else {
- OpaqueValueExpr OVE(BinOp->getLHS()->getExprLoc(),
- BinOp->getLHS()->getType(),
- ExprValueKind::VK_PRValue);
- CodeGenFunction::OpaqueValueMapping OldValMapping(CGF, &OVE,
- OldVal);
- return CGF.EmitAnyExpr(BinOp->getRHS());
- }
- };
+ EmitCriticalReduction(ReductionGen);
+ continue;
+ }
+ // Handle built-in reduction operations.
+ const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
+ if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
+ ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
- (void)CGF.EmitOMPAtomicSimpleUpdateExpr(
- SharedLV, PrivateRV, BO, true,
- llvm::AtomicOrdering::SequentiallyConsistent, Loc, UpdateOp);
- } else if (dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
- // Implicit Reduction Identifiers ( openmp 6.0 sec 7.6.5 )
- auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
- Action.Enter(CGF);
- const auto *OmpOutDRE =
- dyn_cast<DeclRefExpr>(LHSExprs[I]->IgnoreParenImpCasts());
- const auto *OmpInDRE =
- dyn_cast<DeclRefExpr>(RHSExprs[I]->IgnoreParenImpCasts());
-
- if (!OmpOutDRE || !OmpInDRE) {
- return;
- }
- const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
- const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
- CodeGenFunction::OMPPrivateScope LocalScope(CGF);
- LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
- LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
- (void)LocalScope.Privatize();
- CGF.EmitIgnoredExpr(ReductionOp);
- };
- std::string CriticalName = getName({"reduction_critical"});
- emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
- }
+ const Expr *AssignRHS = nullptr;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
+ if (BinOp->getOpcode() == BO_Assign)
+ AssignRHS = BinOp->getRHS();
+ } else if (const auto *OpCall =
+ dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+ if (OpCall->getOperator() == OO_Equal)
+ AssignRHS = OpCall->getArg(1);
}
+
+ if (!AssignRHS)
+ continue;
+
+ const Expr *CombinerExpr = AssignRHS->IgnoreParenImpCasts();
+ if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(CombinerExpr))
+ CombinerExpr = MTE->getSubExpr()->IgnoreParenImpCasts();
+
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ const auto *OmpOutDRE =
+ dyn_cast<DeclRefExpr>(LHSExprs[I]->IgnoreParenImpCasts());
+ const auto *OmpInDRE =
+ dyn_cast<DeclRefExpr>(RHSExprs[I]->IgnoreParenImpCasts());
+ if (!OmpOutDRE || !OmpInDRE)
+ return;
+ const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
+ const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
+ LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
+ (void)LocalScope.Privatize();
+ // Emit the actual reduction operation
+ CGF.EmitIgnoredExpr(ReductionOp);
+ };
+ EmitCriticalReduction(ReductionGen);
}
- // Final barrier
+
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
// Broadcast final result
bool IsAggregate = PrivateType->isAggregateType();
- llvm::Value *FinalResultVal = nullptr;
LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ llvm::Value *FinalResultVal = nullptr;
Address FinalResultAddr = Address::invalid();
- if (IsAggregate) {
+
+ if (IsAggregate)
FinalResultAddr = SharedResult;
- } else {
+ else
FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc);
- }
for (unsigned I : llvm::seq<unsigned>(Privates.size())) {
LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]);
@@ -5124,8 +5121,7 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
}
}
-
- // Final synchronization
+ // Final synchronization barrier
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index 313d0997f18e3..817aff37d6c0d 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -54,7 +54,6 @@ void do_red(int n, int *v, int &sum_v)
}
return 0;
}
-
//.
// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
@@ -454,8 +453,12 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
-// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP7]], align 4
-// CHECK-NEXT: [[TMP30:%.*]] = atomicrmw add ptr @.omp.reduction..internal_private_var.1, i32 [[TMP29]] seq_cst, align 4
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
+// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4
// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4
@@ -532,24 +535,4 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [10 x i32], ptr [[TMP0]], i64 0, i64 0
// CHECK-NEXT: call void @_Z6do_rediPiRi(i32 noundef 10, ptr noundef [[ARRAYDECAY]], ptr noundef nonnull align 4 dereferenceable(4) [[S_V]])
// CHECK-NEXT: ret void
-//
-//.
-// CHECK: attributes #[[ATTR0]] = { mustprogress noinline nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-// CHECK: attributes #[[ATTR2]] = { noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR3]] = { noinline nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR4:[0-9]+]] = { nounwind }
-// CHECK: attributes #[[ATTR5]] = { noinline norecurse nounwind "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-// CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nounwind }
-// CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
-// CHECK: attributes #[[ATTR8]] = { mustprogress noinline norecurse nounwind optnone "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" }
-//.
-// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-// CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 60}
-// CHECK: [[META2:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
-// CHECK: [[META4]] = !{!"llvm.loop.mustprogress"}
-// CHECK: [[META5:![0-9]+]] = !{[[META6:![0-9]+]]}
-// CHECK: [[META6]] = !{i64 2, i64 -1, i64 -1, i1 true}
-// CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]}
-//.
+
>From 9d2370ba2958d204a7de9cef499a2e567764f685 Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE <chandra.nitdgp at gmail.com>
Date: Wed, 7 May 2025 17:44:09 +0530
Subject: [PATCH 11/18] Update OpenMPSupport.rst
---
clang/docs/OpenMPSupport.rst | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index 3c5011a8809d5..904b26cdb0b21 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -406,9 +406,8 @@ implementation.
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
-| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938
-| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709
-|
+| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 |
+| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
| Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 |
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
>From 0ca2f86613bd16fbd08b88bc1da5a5104b48e6d7 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Wed, 7 May 2025 11:46:16 -0500
Subject: [PATCH 12/18] Handle UDR init and updated lit
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 17 +-
.../OpenMP/for_private_reduction_codegen.cpp | 306 ++++++++++++++++--
2 files changed, 284 insertions(+), 39 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 982501f7214a9..c7a010b642c41 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4974,11 +4974,6 @@ void CGOpenMPRuntime::emitPrivateReduction(
llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
- // First barrier to ensure all threads are ready.
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
- CGM.getModule(), OMPRTL___kmpc_barrier),
- BarrierArgs);
- // Initialize the shared variable by the master thread.
llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
@@ -4989,6 +4984,18 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.EmitBlock(InitBB);
auto EmitSharedInit = [&]() {
+ if (UDR) { // Check if it's a User-Defined Reduction
+ if (const Expr *UDRInitExpr = UDR->getInitializer()) {
+ // Use the initializer from the OMPDeclareReductionDecl
+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ } else {
+ // EmitNullInitialization handles default construction for C++ classes
+ // and zeroing for scalars, which is a reasonable default.
+ CGF.EmitNullInitialization(SharedResult, PrivateType);
+ }
+ return; // UDR initialization handled
+ }
if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
const Expr *InitExpr = VD->getInit();
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index 817aff37d6c0d..31a52b611068c 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -4,19 +4,19 @@
#define N 10
class Sum {
int val;
+
public:
Sum(int v = 0) : val(v) {}
- Sum operator+(const Sum& rhs) const {
- return Sum(val + rhs.val);
- }
- Sum& operator+=(const Sum& rhs) {
+ Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+ Sum &operator+=(const Sum &rhs) {
val += rhs.val;
return *this;
}
};
-#pragma omp declare reduction(sum_reduction : Sum : omp_out += omp_in) initializer(omp_priv = Sum(0))
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \
+ initializer(omp_priv = Sum(0))
-void func_red(){
+void func_red() {
Sum result(0);
Sum array[N];
@@ -24,36 +24,50 @@ void func_red(){
array[i] = Sum(i);
}
- #pragma omp parallel private(result) num_threads(4)
+#pragma omp parallel private(result) num_threads(4)
{
- #pragma omp for reduction(sum_reduction:result)
- for (int i = 0; i < N; i++) {
- result = result + array[i];
+#pragma omp for reduction(sum_reduction : result)
+ for (int i = 0; i < N; i++) {
+ result = result + array[i];
+ }
}
+}
+
+void do_red(int n, int *v, int &sum_v) {
+ sum_v = 0;
+#pragma omp for reduction(original(private), + : sum_v)
+ for (int i = 0; i < n; i++) {
+ sum_v += v[i];
}
}
+void do_red_extended(int n, int *v, int &sum_v, int &prod_v) {
+ sum_v = 0;
+ prod_v = 1;
-void do_red(int n, int *v, int &sum_v)
- {
- sum_v = 0;
- #pragma omp for reduction(original(private),+: sum_v)
- for (int i = 0; i < n; i++)
- {
- sum_v += v[i];
- }
- }
- int main(void)
- {
- int v[N];
- for (int i = 0; i < N; i++)
- v[i] = i;
- #pragma omp parallel num_threads(4)
- {
- int s_v;
- do_red(N, v, s_v);
- }
- return 0;
- }
+#pragma omp for reduction(original(private), + : sum_v) \
+ reduction(original(private), * : prod_v)
+ for (int i = 0; i < n; i++) {
+ sum_v += v[i];
+ prod_v *= v[i];
+ }
+}
+int main(void) {
+ int v[N];
+ for (int i = 0; i < N; i++)
+ v[i] = i;
+#pragma omp parallel num_threads(4)
+ {
+ int s_v;
+ do_red(N, v, s_v);
+ }
+
+ int sum_v_ext = 0, prod_v_ext = 1;
+#pragma omp parallel num_threads(4)
+ {
+ do_red_extended(N, v, sum_v_ext, prod_v_ext);
+ }
+ return 0;
+}
//.
// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
@@ -65,6 +79,7 @@ void do_red(int n, int *v, int &sum_v)
// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4
+// CHECK: @.omp.reduction..internal_private_var.2 = common global i32 0, align 4
//.
// CHECK-LABEL: define dso_local void @_Z8func_redv(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -214,11 +229,10 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0
// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var, i8 0, i64 4, i1 false)
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_private_var, i32 noundef 0)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
@@ -445,7 +459,6 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
-// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
@@ -489,12 +502,233 @@ void do_red(int n, int *v, int &sum_v)
// CHECK-NEXT: ret void
//
//
+// CHECK-LABEL: define dso_local void @_Z15do_red_extendediPiRiS0_(
+// CHECK-SAME: i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[PROD_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V5:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[PROD_V7:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x ptr], align 8
+// CHECK-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[_TMP19:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8
+// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: store i32 0, ptr [[I]], align 4
+// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK-NEXT: br i1 [[CMP]], label %[[OMP_PRECOND_THEN:.*]], label %[[OMP_PRECOND_END:.*]]
+// CHECK: [[OMP_PRECOND_THEN]]:
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT: store i32 0, ptr [[SUM_V5]], align 4
+// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[_TMP6]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8
+// CHECK-NEXT: store i32 1, ptr [[PROD_V7]], align 4
+// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[_TMP8]], align 8
+// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK-NEXT: br i1 [[CMP10]], label %[[COND_TRUE:.*]], label %[[COND_FALSE:.*]]
+// CHECK: [[COND_TRUE]]:
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT: br label %[[COND_END:.*]]
+// CHECK: [[COND_FALSE]]:
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: br label %[[COND_END]]
+// CHECK: [[COND_END]]:
+// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], %[[COND_TRUE]] ], [ [[TMP14]], %[[COND_FALSE]] ]
+// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND:.*]]
+// CHECK: [[OMP_INNER_FOR_COND]]:
+// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK-NEXT: br i1 [[CMP11]], label %[[OMP_INNER_FOR_BODY:.*]], label %[[OMP_INNER_FOR_END:.*]]
+// CHECK: [[OMP_INNER_FOR_BODY]]:
+// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[I9]], align 4
+// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64
+// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]]
+// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8
+// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
+// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP22]], align 4
+// CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64
+// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]]
+// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4
+// CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8
+// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]]
+// CHECK-NEXT: store i32 [[MUL15]], ptr [[TMP27]], align 4
+// CHECK-NEXT: br label %[[OMP_BODY_CONTINUE:.*]]
+// CHECK: [[OMP_BODY_CONTINUE]]:
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_INC:.*]]
+// CHECK: [[OMP_INNER_FOR_INC]]:
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1
+// CHECK-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT: br label %[[OMP_INNER_FOR_COND]]
+// CHECK: [[OMP_INNER_FOR_END]]:
+// CHECK-NEXT: br label %[[OMP_LOOP_EXIT:.*]]
+// CHECK: [[OMP_LOOP_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
+// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[TMP30]], align 8
+// CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
+// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[TMP31]], align 8
+// CHECK-NEXT: [[TMP32:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 2, i64 16, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: switch i32 [[TMP32]], [[DOTOMP_REDUCTION_DEFAULT:label %.*]] [
+// CHECK-NEXT: i32 1, [[DOTOMP_REDUCTION_CASE1:label %.*]]
+// CHECK-NEXT: i32 2, label %[[DOTOMP_REDUCTION_CASE2:.*]]
+// CHECK-NEXT: ]
+// CHECK: [[_OMP_REDUCTION_CASE1:.*:]]
+// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP33]], [[TMP34]]
+// CHECK-NEXT: store i32 [[ADD17]], ptr [[TMP9]], align 4
+// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[TMP35]], [[TMP36]]
+// CHECK-NEXT: store i32 [[MUL18]], ptr [[TMP10]], align 4
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_CASE2:.*:]]
+// CHECK-NEXT: [[TMP37:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT: [[TMP38:%.*]] = atomicrmw add ptr [[TMP9]], i32 [[TMP37]] monotonic, align 4
+// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT: [[ATOMIC_LOAD:%.*]] = load atomic i32, ptr [[TMP10]] monotonic, align 4
+// CHECK-NEXT: br label %[[ATOMIC_CONT:.*]]
+// CHECK: [[ATOMIC_CONT]]:
+// CHECK-NEXT: [[TMP40:%.*]] = phi i32 [ [[ATOMIC_LOAD]], %[[DOTOMP_REDUCTION_CASE2]] ], [ [[TMP45:%.*]], %[[ATOMIC_CONT]] ]
+// CHECK-NEXT: store i32 [[TMP40]], ptr [[_TMP19]], align 4
+// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[_TMP19]], align 4
+// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT: [[MUL20:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK-NEXT: store i32 [[MUL20]], ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4
+// CHECK-NEXT: [[TMP44:%.*]] = cmpxchg ptr [[TMP10]], i32 [[TMP40]], i32 [[TMP43]] monotonic monotonic, align 4
+// CHECK-NEXT: [[TMP45]] = extractvalue { i32, i1 } [[TMP44]], 0
+// CHECK-NEXT: [[TMP46:%.*]] = extractvalue { i32, i1 } [[TMP44]], 1
+// CHECK-NEXT: br i1 [[TMP46]], label %[[ATOMIC_EXIT:.*]], label %[[ATOMIC_CONT]]
+// CHECK: [[ATOMIC_EXIT]]:
+// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT: br [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK: [[_OMP_REDUCTION_DEFAULT:.*:]]
+// CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP47]], label %[[INIT:.*]], label %[[INIT_END:.*]]
+// CHECK: [[INIT]]:
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.2, i8 0, i64 4, i1 false)
+// CHECK-NEXT: br label %[[INIT_END]]
+// CHECK: [[INIT_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP48]], [[TMP49]]
+// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP50]], [[TMP51]]
+// CHECK-NEXT: store i32 [[MUL22]], ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP9]], align 4
+// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP10]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
+// CHECK: [[OMP_PRECOND_END]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define internal void @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func(
+// CHECK-SAME: ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
+// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP4]], align 8
+// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP2]], i64 0, i64 0
+// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
+// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
+// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8
+// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP2]], i64 0, i64 1
+// CHECK-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
+// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP5]], align 4
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP13]]
+// CHECK-NEXT: store i32 [[ADD]], ptr [[TMP7]], align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP11]], align 4
+// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP14]], [[TMP15]]
+// CHECK-NEXT: store i32 [[MUL]], ptr [[TMP11]], align 4
+// CHECK-NEXT: ret void
+//
+//
// CHECK-LABEL: define dso_local noundef i32 @main(
// CHECK-SAME: ) #[[ATTR8:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[SUM_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[PROD_V_EXT:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4
// CHECK-NEXT: store i32 0, ptr [[I]], align 4
@@ -518,6 +752,10 @@ void do_red(int n, int *v, int &sum_v)
// CHECK: [[FOR_END]]:
// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4
+// CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4
+// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.3, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
// CHECK-NEXT: ret i32 0
//
//
>From 9335af1858c8befc1796253b1ed05aacf62109a9 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Thu, 8 May 2025 08:07:20 -0500
Subject: [PATCH 13/18] multiple reduced value change
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 144 +++++++++---------
clang/lib/CodeGen/CGOpenMPRuntime.h | 6 +-
.../OpenMP/for_private_reduction_codegen.cpp | 54 ++++---
3 files changed, 110 insertions(+), 94 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index c7a010b642c41..9d13397ee9fb5 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4900,15 +4900,8 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
}
void CGOpenMPRuntime::emitPrivateReduction(
- CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
- ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
- ArrayRef<const Expr *> ReductionOps) {
- if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
- return;
-
- if (LHSExprs.size() != Privates.size() ||
- LHSExprs.size() != ReductionOps.size())
- return;
+ CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates,
+ const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) {
// Create a shared global variable (__shared_reduction_var) to accumulate the
// final result.
@@ -4931,15 +4924,15 @@ void CGOpenMPRuntime::emitPrivateReduction(
// Each thread copies __shared_reduction_var[i] back to LHSExprs[i].
//
// Final __kmpc_barrier to synchronize after broadcasting
- QualType PrivateType = Privates[0]->getType();
+ QualType PrivateType = Privates->getType();
llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
llvm::Constant *InitVal = nullptr;
- const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps[0]);
+ const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps);
// Determine the initial value for the shared reduction variable
if (!UDR) {
InitVal = llvm::Constant::getNullValue(LLVMType);
- if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates)) {
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
const Expr *InitExpr = VD->getInit();
if (InitExpr && !PrivateType->isAggregateType() &&
@@ -4956,11 +4949,18 @@ void CGOpenMPRuntime::emitPrivateReduction(
} else {
InitVal = llvm::Constant::getNullValue(LLVMType);
}
+ std::string ReductionVarNameStr;
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates->IgnoreParenCasts())) {
+ ReductionVarNameStr = DRE->getDecl()->getNameAsString();
+ } else {
+ ReductionVarNameStr = "unnamed_priv_var";
+ }
// Create an internal shared variable
- std::string SharedName = getName({"internal_private_var"});
+ std::string SharedName =
+ CGM.getOpenMPRuntime().getName({"internal_pivate_", ReductionVarNameStr});
llvm::GlobalVariable *SharedVar = new llvm::GlobalVariable(
- CGM.getModule(), LLVMType, false, llvm::GlobalValue::CommonLinkage,
+ CGM.getModule(), LLVMType, false, llvm::GlobalValue::InternalLinkage,
InitVal, ".omp.reduction." + SharedName, nullptr,
llvm::GlobalVariable::NotThreadLocal);
@@ -4996,7 +4996,7 @@ void CGOpenMPRuntime::emitPrivateReduction(
}
return; // UDR initialization handled
}
- if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates[0])) {
+ if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates)) {
if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
const Expr *InitExpr = VD->getInit();
if (InitExpr && (PrivateType->isAggregateType() ||
@@ -5021,47 +5021,45 @@ void CGOpenMPRuntime::emitPrivateReduction(
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_barrier),
BarrierArgs);
- for (unsigned I :
- llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
- const Expr *ReductionOp = ReductionOps[I];
- const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp);
- LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
- LValue LHSLV = CGF.EmitLValue(LHSExprs[I]);
+ const Expr *ReductionOp = ReductionOps;
+ const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp);
+ LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ LValue LHSLV = CGF.EmitLValue(LHSExprs);
- auto EmitCriticalReduction = [&](auto ReductionGen) {
- std::string CriticalName = getName({"reduction_critical"});
- emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
- };
+ auto EmitCriticalReduction = [&](auto ReductionGen) {
+ std::string CriticalName = getName({"reduction_critical"});
+ emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+ };
- if (CurrentUDR) {
- // Handle user-defined reduction.
- auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
- Action.Enter(CGF);
- std::pair<llvm::Function *, llvm::Function *> FnPair =
- getUserDefinedReduction(CurrentUDR);
- if (FnPair.first) {
- if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
- const auto *OutDRE = cast<DeclRefExpr>(
- cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
- ->getSubExpr());
- const auto *InDRE = cast<DeclRefExpr>(
- cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
- ->getSubExpr());
- CodeGenFunction::OMPPrivateScope LocalScope(CGF);
- LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
- SharedLV.getAddress());
- LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
- LHSLV.getAddress());
- (void)LocalScope.Privatize();
- emitReductionCombiner(CGF, ReductionOp);
- }
+ if (CurrentUDR) {
+ // Handle user-defined reduction.
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+ Action.Enter(CGF);
+ std::pair<llvm::Function *, llvm::Function *> FnPair =
+ getUserDefinedReduction(CurrentUDR);
+ if (FnPair.first) {
+ if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
+ const auto *OutDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ const auto *InDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
+ SharedLV.getAddress());
+ LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
+ LHSLV.getAddress());
+ (void)LocalScope.Privatize();
+ emitReductionCombiner(CGF, ReductionOp);
}
- };
- EmitCriticalReduction(ReductionGen);
- continue;
- }
- // Handle built-in reduction operations.
+ }
+ };
+ EmitCriticalReduction(ReductionGen);
+ }
+ // Handle built-in reduction operations.
+ else {
const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
@@ -5077,7 +5075,7 @@ void CGOpenMPRuntime::emitPrivateReduction(
}
if (!AssignRHS)
- continue;
+ return;
const Expr *CombinerExpr = AssignRHS->IgnoreParenImpCasts();
if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(CombinerExpr))
@@ -5086,9 +5084,9 @@ void CGOpenMPRuntime::emitPrivateReduction(
auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
Action.Enter(CGF);
const auto *OmpOutDRE =
- dyn_cast<DeclRefExpr>(LHSExprs[I]->IgnoreParenImpCasts());
+ dyn_cast<DeclRefExpr>(LHSExprs->IgnoreParenImpCasts());
const auto *OmpInDRE =
- dyn_cast<DeclRefExpr>(RHSExprs[I]->IgnoreParenImpCasts());
+ dyn_cast<DeclRefExpr>(RHSExprs->IgnoreParenImpCasts());
if (!OmpOutDRE || !OmpInDRE)
return;
const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
@@ -5109,24 +5107,22 @@ void CGOpenMPRuntime::emitPrivateReduction(
// Broadcast final result
bool IsAggregate = PrivateType->isAggregateType();
- LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+ LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType);
llvm::Value *FinalResultVal = nullptr;
Address FinalResultAddr = Address::invalid();
if (IsAggregate)
FinalResultAddr = SharedResult;
else
- FinalResultVal = CGF.EmitLoadOfScalar(SharedLV, Loc);
-
- for (unsigned I : llvm::seq<unsigned>(Privates.size())) {
- LValue TargetLHSLV = CGF.EmitLValue(LHSExprs[I]);
- if (IsAggregate) {
- CGF.EmitAggregateCopy(TargetLHSLV,
- CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
- PrivateType, AggValueSlot::DoesNotOverlap, false);
- } else {
- CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
- }
+ FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc);
+
+ LValue TargetLHSLV = CGF.EmitLValue(LHSExprs);
+ if (IsAggregate) {
+ CGF.EmitAggregateCopy(TargetLHSLV,
+ CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
+ PrivateType, AggValueSlot::DoesNotOverlap, false);
+ } else {
+ CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
}
// Final synchronization barrier
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
@@ -5436,8 +5432,18 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
CGF.EmitBranch(DefaultBB);
CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
- if (Options.IsPrivateVarReduction)
- emitPrivateReduction(CGF, Loc, Privates, LHSExprs, RHSExprs, ReductionOps);
+ if (Options.IsPrivateVarReduction) {
+ if (LHSExprs.empty() || Privates.empty() || ReductionOps.empty())
+ return;
+ if (LHSExprs.size() != Privates.size() ||
+ LHSExprs.size() != ReductionOps.size())
+ return;
+ for (unsigned I :
+ llvm::seq<unsigned>(std::min(ReductionOps.size(), LHSExprs.size()))) {
+ emitPrivateReduction(CGF, Loc, Privates[I], LHSExprs[I], RHSExprs[I],
+ ReductionOps[I]);
+ }
+ }
}
/// Generates unique name for artificial threadprivate variables.
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 50ba28b565b6d..51595cdb31fa8 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1212,10 +1212,8 @@ class CGOpenMPRuntime {
/// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
/// or 'operator binop(LHS, RHS)'.
void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
- ArrayRef<const Expr *> Privates,
- ArrayRef<const Expr *> LHSExprs,
- ArrayRef<const Expr *> RHSExprs,
- ArrayRef<const Expr *> ReductionOps);
+ const Expr *Privates, const Expr *LHSExprs,
+ const Expr *RHSExprs, const Expr *ReductionOps);
/// Emit a code for reduction clause. Next code should be emitted for
/// reduction:
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
index 31a52b611068c..8532bf4125787 100644
--- a/clang/test/OpenMP/for_private_reduction_codegen.cpp
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -68,6 +68,7 @@ int main(void) {
}
return 0;
}
+
//.
// CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c"
// CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
@@ -75,11 +76,12 @@ int main(void) {
// CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 18, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @[[GLOB3:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
// CHECK: @.gomp_critical_user_.atomic_reduction.var = common global [8 x i32] zeroinitializer, align 8
-// CHECK: @.omp.reduction..internal_private_var = common global %class.Sum zeroinitializer, align 4
+// CHECK: @.omp.reduction..internal_pivate_.result = internal global %class.Sum zeroinitializer, align 4
// CHECK: @.gomp_critical_user_.reduction_critical.var = common global [8 x i32] zeroinitializer, align 8
// CHECK: @[[GLOB4:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 66, i32 0, i32 22, ptr @[[GLOB0]] }, align 8
-// CHECK: @.omp.reduction..internal_private_var.1 = common global i32 0, align 4
-// CHECK: @.omp.reduction..internal_private_var.2 = common global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.sum_v = internal global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.sum_v.1 = internal global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.prod_v = internal global i32 1, align 4
//.
// CHECK-LABEL: define dso_local void @_Z8func_redv(
// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
@@ -232,15 +234,15 @@ int main(void) {
// CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP2]], 0
// CHECK-NEXT: br i1 [[TMP13]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_private_var, i32 noundef 0)
+// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result, i32 noundef 0)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
-// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_private_var, ptr noundef [[RESULT]])
+// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result, ptr noundef [[RESULT]])
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
-// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_private_var, align 4
+// CHECK-NEXT: [[TMP14:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result, align 4
// CHECK-NEXT: store [[CLASS_SUM]] [[TMP14]], ptr [[RESULT]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP2]])
@@ -462,18 +464,18 @@ int main(void) {
// CHECK-NEXT: [[TMP28:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP28]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.1, i8 0, i64 4, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_pivate_.sum_v, i8 0, i64 4, i1 false)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
-// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4
// CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP29]], [[TMP30]]
-// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: store i32 [[ADD12]], ptr @.omp.reduction..internal_pivate_.sum_v, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
-// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.1, align 4
+// CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v, align 4
// CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP7]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
@@ -666,26 +668,36 @@ int main(void) {
// CHECK-NEXT: [[TMP47:%.*]] = icmp eq i32 [[TMP0]], 0
// CHECK-NEXT: br i1 [[TMP47]], label %[[INIT:.*]], label %[[INIT_END:.*]]
// CHECK: [[INIT]]:
-// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_private_var.2, i8 0, i64 4, i1 false)
+// CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 @.omp.reduction..internal_pivate_.sum_v.1, i8 0, i64 4, i1 false)
// CHECK-NEXT: br label %[[INIT_END]]
// CHECK: [[INIT_END]]:
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
-// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: [[TMP48:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4
// CHECK-NEXT: [[TMP49:%.*]] = load i32, ptr [[TMP9]], align 4
// CHECK-NEXT: [[ADD21:%.*]] = add nsw i32 [[TMP48]], [[TMP49]]
-// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: store i32 [[ADD21]], ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.1, align 4
+// CHECK-NEXT: store i32 [[TMP50]], ptr [[TMP9]], align 4
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT: [[TMP51:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT: br i1 [[TMP51]], label %[[INIT22:.*]], label %[[INIT_END23:.*]]
+// CHECK: [[INIT22]]:
+// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v, align 4
+// CHECK-NEXT: br label %[[INIT_END23]]
+// CHECK: [[INIT_END23]]:
+// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
-// CHECK-NEXT: [[TMP50:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4
-// CHECK-NEXT: [[TMP51:%.*]] = load i32, ptr [[TMP10]], align 4
-// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP50]], [[TMP51]]
-// CHECK-NEXT: store i32 [[MUL22]], ptr @.omp.reduction..internal_private_var.2, align 4
+// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4
+// CHECK-NEXT: [[TMP53:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK-NEXT: [[MUL24:%.*]] = mul nsw i32 [[TMP52]], [[TMP53]]
+// CHECK-NEXT: store i32 [[MUL24]], ptr @.omp.reduction..internal_pivate_.prod_v, align 4
// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
-// CHECK-NEXT: [[TMP52:%.*]] = load i32, ptr @.omp.reduction..internal_private_var.2, align 4
-// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP9]], align 4
-// CHECK-NEXT: store i32 [[TMP52]], ptr [[TMP10]], align 4
+// CHECK-NEXT: [[TMP54:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v, align 4
+// CHECK-NEXT: store i32 [[TMP54]], ptr [[TMP10]], align 4
// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
// CHECK-NEXT: br label %[[OMP_PRECOND_END]]
// CHECK: [[OMP_PRECOND_END]]:
@@ -755,7 +767,7 @@ int main(void) {
// CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4
// CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4
// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
-// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.3, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
+// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.2, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
// CHECK-NEXT: ret i32 0
//
//
>From e1a19983c9beeb11b045794025bf5e36761184dc Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Thu, 8 May 2025 13:39:36 -0500
Subject: [PATCH 14/18] UDR init logic leveraged from
emitInitWithReductionInitializer fn
---
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 35 ++++++++++++++++++++++++---
1 file changed, 32 insertions(+), 3 deletions(-)
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 9d13397ee9fb5..b87583049b253 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4986,9 +4986,38 @@ void CGOpenMPRuntime::emitPrivateReduction(
auto EmitSharedInit = [&]() {
if (UDR) { // Check if it's a User-Defined Reduction
if (const Expr *UDRInitExpr = UDR->getInitializer()) {
- // Use the initializer from the OMPDeclareReductionDecl
- CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
- PrivateType.getQualifiers(), true);
+ std::pair<llvm::Function *, llvm::Function *> FnPair =
+ getUserDefinedReduction(UDR);
+ llvm::Function *InitializerFn = FnPair.second;
+ if (InitializerFn) {
+ if (const auto *CE =
+ dyn_cast<CallExpr>(UDRInitExpr->IgnoreParenImpCasts())) {
+ const auto *OutDRE = cast<DeclRefExpr>(
+ cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+ ->getSubExpr());
+ const VarDecl *OutVD = cast<VarDecl>(OutDRE->getDecl());
+
+ CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+ LocalScope.addPrivate(OutVD, SharedResult);
+
+ (void)LocalScope.Privatize();
+ if (const auto *OVE = dyn_cast<OpaqueValueExpr>(
+ CE->getCallee()->IgnoreParenImpCasts())) {
+ CodeGenFunction::OpaqueValueMapping OpaqueMap(
+ CGF, OVE, RValue::get(InitializerFn));
+ CGF.EmitIgnoredExpr(CE);
+ } else {
+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ }
+ } else {
+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ }
+ } else {
+ CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+ PrivateType.getQualifiers(), true);
+ }
} else {
// EmitNullInitialization handles default construction for C++ classes
// and zeroing for scalars, which is a reasonable default.
>From efd69bb41b827690f46485d24194e5b1b6f98bb9 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Fri, 9 May 2025 05:20:58 -0500
Subject: [PATCH 15/18] runtime tests
---
.../for/omp_for_private_reduction.cpp | 93 +++++++++++++++++++
1 file changed, 93 insertions(+)
create mode 100644 openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
new file mode 100644
index 0000000000000..0a3bbafd9331f
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -0,0 +1,93 @@
+//RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+#define N 10
+class Sum {
+ int val;
+
+public:
+ Sum(int v = 0) : val(v) {}
+ Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+ Sum &operator+=(const Sum &rhs) {
+ val += rhs.val;
+ return *this;
+ }
+ int getValue() const { return val; }
+};
+
+// Declare OpenMP reduction
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \
+ initializer(omp_priv = Sum(0))
+
+int checkUserDefinedReduction() {
+ Sum final_result_udr(0);
+ Sum array_sum[N];
+ int error_flag = 0;
+ int expected_value = 0;
+ for (int i = 0; i < N; ++i) {
+ array_sum[i] = Sum(i);
+ expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1)
+ }
+#pragma omp parallel num_threads(4)
+ {
+#pragma omp for reduction(sum_reduction : final_result_udr)
+ for (int i = 0; i < N; ++i) {
+ final_result_udr += array_sum[i];
+ }
+
+ if (final_result_udr.getValue() != expected_value)
+ error_flag += 1;
+ }
+ return error_flag;
+}
+
+void performReductions(int n_elements, const int *input_values,
+ int &sum_val_out, int &prod_val_out,
+ float &float_sum_val_out) {
+ // private variables for this thread's reduction.
+ sum_val_out = 0;
+ prod_val_out = 1;
+ float_sum_val_out = 0.0f;
+
+ const float kPiValue = 3.14f;
+#pragma omp for reduction(original(private), + : sum_val_out) \
+ reduction(original(private), * : prod_val_out) \
+ reduction(original(private), + : float_sum_val_out)
+ for (int i = 0; i < n_elements; ++i) {
+ sum_val_out += input_values[i];
+ prod_val_out *= (i + 1);
+ float_sum_val_out += kPiValue;
+ }
+}
+int main(void) {
+ int input_array[N];
+ int total_errors = 0;
+ const float kPiVal = 3.14f;
+ const int kExpectedSum = 45; // Sum of 0..9
+ const int kExpectedProd = 3628800; // 10!
+ const float kExpectedFsum = kPiVal * N; // 3.14f * 10
+
+ for (int i = 0; i < N; i++)
+ input_array[i] = i;
+#pragma omp parallel num_threads(4)
+ {
+
+ int t_sum_v;
+ int t_prod_v;
+ float t_fsum_v;
+ performReductions(N, input_array, t_sum_v, t_prod_v, t_fsum_v);
+ if (t_sum_v != kExpectedSum)
+ total_errors++;
+ if (t_prod_v != kExpectedProd)
+ total_errors++;
+ if (t_fsum_v != kExpectedFsum)
+ total_errors++;
+ }
+ total_errors += checkUserDefinedReduction();
+ if (total_errors != 0)
+ fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors);
+
+ return total_errors;
+}
>From c01671e8f164c59d1ff625f7610e8bcc0b341519 Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE <chandra.nitdgp at gmail.com>
Date: Fri, 9 May 2025 15:56:23 +0530
Subject: [PATCH 16/18] Update omp_for_private_reduction.cpp
---
.../runtime/test/worksharing/for/omp_for_private_reduction.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
index 0a3bbafd9331f..799955266cd67 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -30,7 +30,7 @@ int checkUserDefinedReduction() {
array_sum[i] = Sum(i);
expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1)
}
-#pragma omp parallel num_threads(4)
+#pragma omp parallel num_threads(4) private(final_result_udr)
{
#pragma omp for reduction(sum_reduction : final_result_udr)
for (int i = 0; i < N; ++i) {
>From ad0d2f0d43a0783491d804b14f2d9af69dae1493 Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE <chandra.nitdgp at gmail.com>
Date: Fri, 9 May 2025 16:02:19 +0530
Subject: [PATCH 17/18] Update omp_for_private_reduction.cpp
Formating
---
.../test/worksharing/for/omp_for_private_reduction.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
index 799955266cd67..faf85a82b6232 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -1,4 +1,4 @@
-//RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run
+// RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run
#include <stdio.h>
#include <omp.h>
#include "omp_testsuite.h"
@@ -65,8 +65,8 @@ int main(void) {
int input_array[N];
int total_errors = 0;
const float kPiVal = 3.14f;
- const int kExpectedSum = 45; // Sum of 0..9
- const int kExpectedProd = 3628800; // 10!
+ const int kExpectedSum = 45; // Sum of 0..9
+ const int kExpectedProd = 3628800; // 10!
const float kExpectedFsum = kPiVal * N; // 3.14f * 10
for (int i = 0; i < N; i++)
>From 4df291081a6d0ffe1611f8717299d983838d5768 Mon Sep 17 00:00:00 2001
From: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
Date: Fri, 9 May 2025 06:35:57 -0500
Subject: [PATCH 18/18] update test
---
.../for/omp_for_private_reduction.cpp | 28 ++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
index faf85a82b6232..f907719cf5f34 100644
--- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -1,6 +1,7 @@
// RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run
#include <stdio.h>
#include <omp.h>
+#include <limits.h>
#include "omp_testsuite.h"
#define N 10
@@ -42,7 +43,20 @@ int checkUserDefinedReduction() {
}
return error_flag;
}
-
+void performMinMaxRed(int &min_val, int &max_val) {
+ int input_data[] = {7, 3, 12, 5, 8};
+ int n_size = sizeof(input_data) / sizeof(input_data[0]);
+ min_val = INT_MAX;
+ max_val = INT_MIN;
+#pragma omp for reduction(original(private), min : min_val) \
+ reduction(original(private), max : max_val)
+ for (int i = 0; i < n_size; ++i) {
+ if (input_data[i] < min_val)
+ min_val = input_data[i];
+ if (input_data[i] > max_val)
+ max_val = input_data[i];
+ }
+}
void performReductions(int n_elements, const int *input_values,
int &sum_val_out, int &prod_val_out,
float &float_sum_val_out) {
@@ -68,6 +82,8 @@ int main(void) {
const int kExpectedSum = 45; // Sum of 0..9
const int kExpectedProd = 3628800; // 10!
const float kExpectedFsum = kPiVal * N; // 3.14f * 10
+ const int kExpectedMin = 3;
+ const int kExpectedMax = 12;
for (int i = 0; i < N; i++)
input_array[i] = i;
@@ -85,6 +101,16 @@ int main(void) {
if (t_fsum_v != kExpectedFsum)
total_errors++;
}
+#pragma omp parallel num_threads(4)
+ {
+ int t_min_v;
+ int t_max_v;
+ performMinMaxRed(t_min_v, t_max_v);
+ if (t_min_v != kExpectedMin)
+ total_errors++;
+ if (t_max_v != kExpectedMax)
+ total_errors++;
+ }
total_errors += checkUserDefinedReduction();
if (total_errors != 0)
fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors);
More information about the Openmp-commits
mailing list