[Openmp-commits] [clang] [llvm] [mlir] [openmp] Reapply "[OpenMP][offload] Cross-team reductions with variable number of teams" (#204914) (PR #205071)
via Openmp-commits
openmp-commits at lists.llvm.org
Mon Jun 22 02:26:17 PDT 2026
llvmorg-github-actions[bot] wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-mlir-openmp
Author: Robert Imschweiler (ro-i)
<details>
<summary>Changes</summary>
This reapplies the change named in the subject line, together with the fix for the problem described in https://github.com/llvm/llvm-project/pull/195102#issuecomment-4756584289
---
Patch is 1.41 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/205071.diff
173 Files Affected:
- (modified) clang/include/clang/Basic/LangOptions.def (-1)
- (modified) clang/include/clang/Options/Options.td (+5-1)
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+5-7)
- (modified) clang/lib/Driver/ToolChains/Clang.cpp (+11-2)
- (modified) clang/lib/Frontend/CompilerInvocation.cpp (-7)
- (modified) clang/test/Driver/openmp-offload-gpu.c (+2-2)
- (modified) clang/test/OpenMP/bug60602.cpp (+2-2)
- (modified) clang/test/OpenMP/declare_target_local_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/distribute_codegen.cpp (+10-10)
- (modified) clang/test/OpenMP/distribute_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_codegen.cpp (+28-28)
- (modified) clang/test/OpenMP/distribute_parallel_for_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_if_codegen.cpp (+8-8)
- (modified) clang/test/OpenMP/distribute_parallel_for_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_num_threads_codegen.cpp (+24-24)
- (modified) clang/test/OpenMP/distribute_parallel_for_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_proc_bind_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_codegen.cpp (+28-28)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_if_codegen.cpp (+32-32)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_num_threads_codegen.cpp (+24-24)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_parallel_for_simd_proc_bind_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/distribute_private_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/distribute_simd_codegen.cpp (+20-20)
- (modified) clang/test/OpenMP/distribute_simd_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_simd_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/distribute_simd_private_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/distribute_simd_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/map_struct_ordering.cpp (+1-1)
- (modified) clang/test/OpenMP/nvptx_lambda_capturing.cpp (+5-5)
- (modified) clang/test/OpenMP/reduction_implicit_map.cpp (+6-6)
- (modified) clang/test/OpenMP/spirv_target_teams_reduction_addrspace.c (+1-1)
- (modified) clang/test/OpenMP/target_codegen_global_capture.cpp (+6-6)
- (modified) clang/test/OpenMP/target_default_codegen.cpp (+16-16)
- (modified) clang/test/OpenMP/target_defaultmap_codegen_03.cpp (+8-8)
- (modified) clang/test/OpenMP/target_dyn_groupprivate_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/target_firstprivate_codegen.cpp (+24-24)
- (modified) clang/test/OpenMP/target_has_device_addr_codegen.cpp (+15-15)
- (modified) clang/test/OpenMP/target_has_device_addr_codegen_01.cpp (+2-2)
- (modified) clang/test/OpenMP/target_is_device_ptr_codegen.cpp (+44-44)
- (modified) clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp (+1-1)
- (modified) clang/test/OpenMP/target_map_array_section_no_length_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp (+1-1)
- (modified) clang/test/OpenMP/target_map_codegen_03.cpp (+2-2)
- (modified) clang/test/OpenMP/target_map_codegen_hold.cpp (+12-12)
- (modified) clang/test/OpenMP/target_map_deref_array_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/target_map_member_expr_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/target_offload_mandatory_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/target_ompx_dyn_cgroup_mem_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/target_parallel_codegen.cpp (+14-14)
- (modified) clang/test/OpenMP/target_parallel_for_codegen.cpp (+28-28)
- (modified) clang/test/OpenMP/target_parallel_for_simd_codegen.cpp (+28-28)
- (modified) clang/test/OpenMP/target_parallel_generic_loop_codegen-1.cpp (+12-12)
- (modified) clang/test/OpenMP/target_parallel_generic_loop_codegen-2.cpp (+2-2)
- (modified) clang/test/OpenMP/target_parallel_generic_loop_uses_allocators_codegen.cpp (+1-1)
- (modified) clang/test/OpenMP/target_parallel_if_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/target_parallel_num_threads_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/target_parallel_num_threads_strict_codegen.cpp (+8-8)
- (modified) clang/test/OpenMP/target_task_affinity_codegen.cpp (+2-2)
- (modified) clang/test/OpenMP/target_teams_codegen.cpp (+26-26)
- (modified) clang/test/OpenMP/target_teams_distribute_codegen.cpp (+14-14)
- (modified) clang/test/OpenMP/target_teams_distribute_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_distribute_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_if_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_order_codegen.cpp (+1-1)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_proc_bind_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_schedule_codegen.cpp (+60-60)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_if_codegen.cpp (+24-24)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_proc_bind_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_parallel_for_simd_schedule_codegen.cpp (+60-60)
- (modified) clang/test/OpenMP/target_teams_distribute_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_reduction_codegen.cpp (+40-40)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp (+28-28)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_generic_loop_codegen-1.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_generic_loop_codegen.cpp (+1-27)
- (modified) clang/test/OpenMP/target_teams_generic_loop_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/target_teams_generic_loop_if_codegen.cpp (+5-5)
- (modified) clang/test/OpenMP/target_teams_generic_loop_order_codegen.cpp (+1-1)
- (modified) clang/test/OpenMP/target_teams_generic_loop_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_generic_loop_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/target_teams_generic_loop_uses_allocators_codegen.cpp (+1-1)
- (modified) clang/test/OpenMP/target_teams_map_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/target_teams_num_teams_codegen.cpp (+12-12)
- (renamed) clang/test/OpenMP/target_teams_reduction_codegen.cpp (+89-1400)
- (modified) clang/test/OpenMP/target_teams_thread_limit_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_codegen.cpp (+20-20)
- (modified) clang/test/OpenMP/teams_distribute_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_distribute_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/teams_distribute_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_copyin_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_if_codegen.cpp (+8-8)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_num_threads_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_proc_bind_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_schedule_codegen.cpp (+60-60)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_if_codegen.cpp (+32-32)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_num_threads_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_proc_bind_codegen.cpp (+3-3)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_parallel_for_simd_schedule_codegen.cpp (+60-60)
- (modified) clang/test/OpenMP/teams_distribute_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_simd_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/teams_distribute_simd_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/teams_distribute_simd_dist_schedule_codegen.cpp (+18-18)
- (modified) clang/test/OpenMP/teams_distribute_simd_firstprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_simd_lastprivate_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_simd_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_distribute_simd_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_firstprivate_codegen.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_generic_loop_codegen-1.cpp (+12-12)
- (modified) clang/test/OpenMP/teams_generic_loop_collapse_codegen.cpp (+6-6)
- (modified) clang/test/OpenMP/teams_generic_loop_private_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_generic_loop_reduction_codegen.cpp (+4-4)
- (modified) clang/test/OpenMP/teams_private_codegen.cpp (+10-10)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPConstants.h (+1-1)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+31-11)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPKinds.def (+6-8)
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+153-74)
- (modified) llvm/lib/Transforms/IPO/OpenMPOpt.cpp (+1-1)
- (modified) llvm/test/Transforms/OpenMP/add_attributes.ll (+4-4)
- (modified) mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp (-4)
- (modified) mlir/test/Target/LLVMIR/allocatable_gpu_reduction_teams.mlir (+4-12)
- (modified) mlir/test/Target/LLVMIR/omptarget-multi-reduction.mlir (+3-2)
- (modified) mlir/test/Target/LLVMIR/omptarget-region-device-llvm.mlir (+1-1)
- (modified) mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction-array-descriptor.mlir (+10-15)
- (modified) mlir/test/Target/LLVMIR/omptarget-teams-distribute-reduction.mlir (+2-1)
- (modified) mlir/test/Target/LLVMIR/omptarget-teams-reduction.mlir (+2-1)
- (modified) mlir/test/Target/LLVMIR/openmp-target-launch-device.mlir (+2-2)
- (modified) offload/include/Shared/Environment.h (+1-3)
- (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+7-5)
- (modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+16-10)
- (modified) openmp/device/include/Interface.h (+6-7)
- (modified) openmp/device/src/Reduction.cpp (+154-168)
``````````diff
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 319fd18cddb36..d68784b7efbcd 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -231,7 +231,6 @@ LANGOPT(OpenMPCUDAMode , 1, 0, NotCompatible, "Generate code for OpenMP pragm
LANGOPT(OpenMPIRBuilder , 1, 0, NotCompatible, "Use the experimental OpenMP-IR-Builder codegen path.")
LANGOPT(OpenMPCUDANumSMs , 32, 0, NotCompatible, "Number of SMs for CUDA devices.")
LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, NotCompatible, "Number of blocks per SM for CUDA devices.")
-LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, NotCompatible, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.")
LANGOPT(OpenMPTargetDebug , 32, 0, NotCompatible, "Enable debugging in the OpenMP offloading device RTL")
LANGOPT(OpenMPOptimisticCollapse , 1, 0, NotCompatible, "Use at most 32 bits to represent the collapsed loop nest counter.")
LANGOPT(OpenMPThreadSubscription , 1, 0, NotCompatible, "Assume work-shared loops do not have more iterations than participating threads.")
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index e4a9d95ece0ab..377897a15f746 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -4120,7 +4120,11 @@ def fopenmp_cuda_number_of_sm_EQ : Joined<["-"], "fopenmp-cuda-number-of-sm=">,
def fopenmp_cuda_blocks_per_sm_EQ : Joined<["-"], "fopenmp-cuda-blocks-per-sm=">, Group<f_Group>,
Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
def fopenmp_cuda_teams_reduction_recs_num_EQ : Joined<["-"], "fopenmp-cuda-teams-reduction-recs-num=">, Group<f_Group>,
- Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>;
+ Flags<[NoArgumentUnused, HelpHidden]>, Visibility<[ClangOption, CC1Option]>,
+ HelpText<"Deprecated and ignored. The teams reduction buffer is sized "
+ "automatically at kernel launch to match the actual number of "
+ "teams; this flag is accepted for backwards compatibility only "
+ "and emits a deprecation warning when used.">;
//===----------------------------------------------------------------------===//
// Shared cc1 + fc1 OpenMP Target Options
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index cb0e7297f1a89..19e8c73884dfc 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -788,8 +788,7 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
? 0
: DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
CGBuilderTy &Bld = CGF.Builder;
- OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
- C.getLangOpts().OpenMPCUDAReductionBufNum);
+ OMPBuilder.createTargetDeinit(Bld, ReductionDataSize);
TeamsReductions.clear();
}
@@ -1698,8 +1697,6 @@ void CGOpenMPRuntimeGPU::emitReduction(
bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
- ASTContext &C = CGM.getContext();
-
if (Options.SimpleReduction) {
assert(!TeamsReduction && !ParallelReduction &&
"Invalid reduction selection in emitReduction.");
@@ -1790,12 +1787,13 @@ void CGOpenMPRuntimeGPU::emitReduction(
Idx++;
}
+ bool IsSPMD = getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD;
llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
cantFail(OMPBuilder.createReductionsGPU(
OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, /*IsByRef=*/{}, false,
- TeamsReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
- CGF.getTarget().getGridValue(),
- C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc));
+ TeamsReduction, IsSPMD,
+ llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
+ CGF.getTarget().getGridValue(), RTLoc));
CGF.Builder.restoreIP(AfterIP);
}
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 906abd1474b6c..7763b33567d87 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -6952,8 +6952,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-fno-openmp-extensions");
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_number_of_sm_EQ);
Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_cuda_blocks_per_sm_EQ);
- Args.AddAllArgs(CmdArgs,
- options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ);
+ // '-fopenmp-cuda-teams-reduction-recs-num=' is deprecated and has no
+ // effect: the teams reduction buffer is sized at kernel launch by the
+ // offload plugin to match the actual number of teams. Honoring a
+ // smaller user-supplied value would silently truncate the buffer for
+ // larger launches.
+ if (Arg *A = Args.getLastArg(
+ options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ))
+ D.Diag(diag::warn_drv_deprecated_custom)
+ << A->getAsString(Args)
+ << "the value is ignored; the teams reduction buffer is sized "
+ "automatically at kernel launch";
if (Args.hasFlag(options::OPT_fopenmp_optimistic_collapse,
options::OPT_fno_openmp_optimistic_collapse,
/*Default=*/false))
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index d2847739e3143..ca2d02c7dbd97 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3910,10 +3910,6 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
GenerateArg(Consumer, OPT_fopenmp_cuda_blocks_per_sm_EQ,
Twine(Opts.OpenMPCUDABlocksPerSM));
- if (Opts.OpenMPCUDAReductionBufNum != 1024)
- GenerateArg(Consumer, OPT_fopenmp_cuda_teams_reduction_recs_num_EQ,
- Twine(Opts.OpenMPCUDAReductionBufNum));
-
if (!Opts.OMPTargetTriples.empty()) {
std::string Targets;
llvm::raw_string_ostream OS(Targets);
@@ -4359,9 +4355,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
Opts.OpenMPCUDABlocksPerSM =
getLastArgIntValue(Args, options::OPT_fopenmp_cuda_blocks_per_sm_EQ,
Opts.OpenMPCUDABlocksPerSM, Diags);
- Opts.OpenMPCUDAReductionBufNum = getLastArgIntValue(
- Args, options::OPT_fopenmp_cuda_teams_reduction_recs_num_EQ,
- Opts.OpenMPCUDAReductionBufNum, Diags);
}
// Set the value of the debugging flag used in the new offloading device RTL.
diff --git a/clang/test/Driver/openmp-offload-gpu.c b/clang/test/Driver/openmp-offload-gpu.c
index b5644e6ae445f..4ad0aede8a50e 100644
--- a/clang/test/Driver/openmp-offload-gpu.c
+++ b/clang/test/Driver/openmp-offload-gpu.c
@@ -193,8 +193,8 @@
// RUN: %clang -### -nogpulib -nogpuinc -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 %s -fopenmp-cuda-teams-reduction-recs-num=2048 2>&1 \
// RUN: | FileCheck -check-prefix=CUDA_RED_RECS %s
-// CUDA_RED_RECS: "-cc1"{{.*}}"-triple" "nvptx64-nvidia-cuda"
-// CUDA_RED_RECS-SAME: "-fopenmp-cuda-teams-reduction-recs-num=2048"
+// CUDA_RED_RECS: warning: argument '-fopenmp-cuda-teams-reduction-recs-num=2048' is deprecated, the value is ignored; the teams reduction buffer is sized automatically at kernel launch
+// CUDA_RED_RECS-NOT: "-fopenmp-cuda-teams-reduction-recs-num=2048"
// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
// RUN: --offload-arch=sm_52 --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc %s 2>&1 \
diff --git a/clang/test/OpenMP/bug60602.cpp b/clang/test/OpenMP/bug60602.cpp
index e9174d7be3a12..8235a5a7d83d1 100644
--- a/clang/test/OpenMP/bug60602.cpp
+++ b/clang/test/OpenMP/bug60602.cpp
@@ -119,7 +119,7 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) {
// CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [6 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds [6 x i64], ptr [[DOTOFFLOAD_SIZES]], i32 0, i32 0
// CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK-NEXT: store i32 4, ptr [[TMP37]], align 4
+// CHECK-NEXT: store i32 5, ptr [[TMP37]], align 4
// CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK-NEXT: store i32 6, ptr [[TMP38]], align 4
// CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -223,7 +223,7 @@ int kernel_within_loop(int *a, int *b, int N, int num_iters) {
// CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP89]], 1
// CHECK-NEXT: [[TMP90:%.*]] = zext i32 [[ADD]] to i64
// CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 0
-// CHECK-NEXT: store i32 4, ptr [[TMP91]], align 4
+// CHECK-NEXT: store i32 5, ptr [[TMP91]], align 4
// CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 1
// CHECK-NEXT: store i32 6, ptr [[TMP92]], align 4
// CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS14]], i32 0, i32 2
diff --git a/clang/test/OpenMP/declare_target_local_codegen.cpp b/clang/test/OpenMP/declare_target_local_codegen.cpp
index b82e8b3bba9ff..fe3a46aed5916 100644
--- a/clang/test/OpenMP/declare_target_local_codegen.cpp
+++ b/clang/test/OpenMP/declare_target_local_codegen.cpp
@@ -130,7 +130,7 @@ int use_new_local_vars() {
// HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// HOST-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// HOST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// HOST-NEXT: store i32 4, ptr [[TMP8]], align 4
+// HOST-NEXT: store i32 5, ptr [[TMP8]], align 4
// HOST-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// HOST-NEXT: store i32 2, ptr [[TMP9]], align 4
// HOST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -217,7 +217,7 @@ int use_new_local_vars() {
// HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// HOST-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// HOST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// HOST-NEXT: store i32 4, ptr [[TMP8]], align 4
+// HOST-NEXT: store i32 5, ptr [[TMP8]], align 4
// HOST-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// HOST-NEXT: store i32 2, ptr [[TMP9]], align 4
// HOST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -291,7 +291,7 @@ int use_new_local_vars() {
// HOST-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// HOST-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// HOST-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// HOST-NEXT: store i32 4, ptr [[TMP8]], align 4
+// HOST-NEXT: store i32 5, ptr [[TMP8]], align 4
// HOST-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// HOST-NEXT: store i32 2, ptr [[TMP9]], align 4
// HOST-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
diff --git a/clang/test/OpenMP/distribute_codegen.cpp b/clang/test/OpenMP/distribute_codegen.cpp
index 62b7ad8b979a2..afd18e91911dd 100644
--- a/clang/test/OpenMP/distribute_codegen.cpp
+++ b/clang/test/OpenMP/distribute_codegen.cpp
@@ -169,7 +169,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT: store i32 4, ptr [[TMP21]], align 4
+// CHECK1-NEXT: store i32 5, ptr [[TMP21]], align 4
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK1-NEXT: store i32 5, ptr [[TMP22]], align 4
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -368,7 +368,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT: store i32 4, ptr [[TMP21]], align 4
+// CHECK1-NEXT: store i32 5, ptr [[TMP21]], align 4
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK1-NEXT: store i32 5, ptr [[TMP22]], align 4
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -567,7 +567,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT: store i32 4, ptr [[TMP21]], align 4
+// CHECK1-NEXT: store i32 5, ptr [[TMP21]], align 4
// CHECK1-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK1-NEXT: store i32 5, ptr [[TMP22]], align 4
// CHECK1-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -774,7 +774,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], 1
// CHECK1-NEXT: [[TMP13:%.*]] = zext i32 [[ADD4]] to i64
// CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT: store i32 4, ptr [[TMP14]], align 4
+// CHECK1-NEXT: store i32 5, ptr [[TMP14]], align 4
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK1-NEXT: store i32 2, ptr [[TMP15]], align 4
// CHECK1-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -949,7 +949,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK1-NEXT: store i32 4, ptr [[TMP10]], align 4
+// CHECK1-NEXT: store i32 5, ptr [[TMP10]], align 4
// CHECK1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK1-NEXT: store i32 2, ptr [[TMP11]], align 4
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1130,7 +1130,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT: store i32 4, ptr [[TMP21]], align 4
+// CHECK3-NEXT: store i32 5, ptr [[TMP21]], align 4
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK3-NEXT: store i32 5, ptr [[TMP22]], align 4
// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1325,7 +1325,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT: store i32 4, ptr [[TMP21]], align 4
+// CHECK3-NEXT: store i32 5, ptr [[TMP21]], align 4
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK3-NEXT: store i32 5, ptr [[TMP22]], align 4
// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1520,7 +1520,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP20:%.*]] = getelementptr inbounds [5 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT: store i32 4, ptr [[TMP21]], align 4
+// CHECK3-NEXT: store i32 5, ptr [[TMP21]], align 4
// CHECK3-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK3-NEXT: store i32 5, ptr [[TMP22]], align 4
// CHECK3-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1723,7 +1723,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP12]], 1
// CHECK3-NEXT: [[TMP13:%.*]] = zext i32 [[ADD4]] to i64
// CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0
-// CHECK3-NEXT: store i32 4, ptr [[TMP14]], align 4
+// CHECK3-NEXT: store i32 5, ptr [[TMP14]], align 4
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1
// CHECK3-NEXT: store i32 2, ptr [[TMP15]], align 4
// CHECK3-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2
@@ -1898,7 +1898,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0
// CHECK3-NEXT: [[T...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/205071
More information about the Openmp-commits
mailing list