[llvm] [OpenMP] Associate the KernelEnvironment with the GenericKernelTy (PR #70383)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 26 14:51:53 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-codegen
Author: Johannes Doerfert (jdoerfert)
<details>
<summary>Changes</summary>
By associating the kernel environment with the generic kernel we can
access middle-end information easily, including the launch bounds ranges
that are acceptable. By constraining the number of threads accordingly,
we now obey the user-provided bounds that were passed via attributes.
---
Patch is 1.54 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/70383.diff
25 Files Affected:
- (modified) clang/lib/CodeGen/CGOpenMPRuntime.cpp (+45-48)
- (modified) clang/lib/CodeGen/CGOpenMPRuntime.h (+9-1)
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp (+20-10)
- (modified) clang/lib/CodeGen/CGOpenMPRuntimeGPU.h (+2-2)
- (modified) clang/test/OpenMP/distribute_simd_codegen.cpp (+508-508)
- (modified) clang/test/OpenMP/target_num_teams_num_threads_attributes.cpp (+11-98)
- (modified) clang/test/OpenMP/target_parallel_codegen.cpp (+110-110)
- (modified) clang/test/OpenMP/target_parallel_debug_codegen.cpp (+420-420)
- (modified) clang/test/OpenMP/target_parallel_for_codegen.cpp (+314-314)
- (modified) clang/test/OpenMP/target_parallel_for_debug_codegen.cpp (+589-589)
- (modified) clang/test/OpenMP/target_parallel_for_simd_codegen.cpp (+932-932)
- (modified) clang/test/OpenMP/target_parallel_generic_loop_codegen-3.cpp (+589-589)
- (modified) clang/test/OpenMP/target_teams_distribute_simd_codegen.cpp (+998-998)
- (modified) clang/test/OpenMP/teams_distribute_simd_codegen.cpp (+206-206)
- (modified) llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h (+27-26)
- (modified) llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp (+55-60)
- (modified) llvm/lib/Transforms/IPO/OpenMPOpt.cpp (+4-2)
- (modified) openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp (+3-5)
- (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.cpp (+27-47)
- (modified) openmp/libomptarget/plugins-nextgen/common/PluginInterface/PluginInterface.h (+19-20)
- (modified) openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp (+3-5)
- (modified) openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp (+8-12)
- (modified) openmp/libomptarget/test/offloading/default_thread_limit.c (+1-2)
- (modified) openmp/libomptarget/test/offloading/thread_state_1.c (+2-2)
- (modified) openmp/libomptarget/test/offloading/thread_state_2.c (+2-2)
``````````diff
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 6262b3416a1730a..c1be7c2d0321589 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -6002,6 +6002,42 @@ void CGOpenMPRuntime::emitUsesAllocatorsFini(CodeGenFunction &CGF,
{ThreadId, AllocatorVal});
}
+void CGOpenMPRuntime::computeMinAndMaxThreadsAndTeams(
+ const OMPExecutableDirective &D, CodeGenFunction &CGF,
+ int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal,
+ int32_t &MaxTeamsVal) {
+
+ getNumTeamsExprForTargetDirective(CGF, D, MinTeamsVal, MaxTeamsVal);
+ getNumThreadsExprForTargetDirective(CGF, D, MaxThreadsVal,
+ /*UpperBoundOnly=*/true);
+
+ for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
+ for (auto *A : C->getAttrs()) {
+ int32_t AttrMinThreadsVal = 1, AttrMaxThreadsVal = -1;
+ int32_t AttrMinBlocksVal = 1, AttrMaxBlocksVal = -1;
+ if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
+ CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &AttrMaxThreadsVal,
+ &AttrMinBlocksVal, &AttrMaxBlocksVal);
+ else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
+ CGM.handleAMDGPUFlatWorkGroupSizeAttr(
+ nullptr, Attr, /*ReqdWGS=*/nullptr, &AttrMinThreadsVal,
+ &AttrMaxThreadsVal);
+ else
+ continue;
+
+ MinThreadsVal = std::max(MinThreadsVal, AttrMinThreadsVal);
+ if (AttrMaxThreadsVal > 0)
+ MaxThreadsVal = MaxThreadsVal > 0
+ ? std::min(MaxThreadsVal, AttrMaxThreadsVal)
+ : AttrMaxThreadsVal;
+ MinTeamsVal = std::max(MinTeamsVal, AttrMinBlocksVal);
+ if (AttrMaxBlocksVal > 0)
+ MaxTeamsVal = MaxTeamsVal > 0 ? std::min(MaxTeamsVal, AttrMaxBlocksVal)
+ : AttrMaxBlocksVal;
+ }
+ }
+}
+
void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
const OMPExecutableDirective &D, StringRef ParentName,
llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
@@ -6020,47 +6056,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper(
return CGF.GenerateOpenMPCapturedStmtFunction(CS, D.getBeginLoc());
};
- // Get NumTeams and ThreadLimit attributes
- int32_t DefaultValMinTeams = 1;
- int32_t DefaultValMaxTeams = -1;
- uint32_t DefaultValMinThreads = 1;
- uint32_t DefaultValMaxThreads = UINT32_MAX;
-
- getNumTeamsExprForTargetDirective(CGF, D, DefaultValMinTeams,
- DefaultValMaxTeams);
- getNumThreadsExprForTargetDirective(CGF, D, DefaultValMaxThreads,
- /*UpperBoundOnly=*/true);
-
- for (auto *C : D.getClausesOfKind<OMPXAttributeClause>()) {
- for (auto *A : C->getAttrs()) {
- int32_t MinThreadsVal = 1, MaxThreadsVal = 0;
- int32_t MinBlocksVal = 1, MaxBlocksVal = -1;
- if (auto *Attr = dyn_cast<CUDALaunchBoundsAttr>(A))
- CGM.handleCUDALaunchBoundsAttr(nullptr, Attr, &MaxThreadsVal,
- &MinBlocksVal, &MaxBlocksVal);
- else if (auto *Attr = dyn_cast<AMDGPUFlatWorkGroupSizeAttr>(A))
- CGM.handleAMDGPUFlatWorkGroupSizeAttr(
- nullptr, Attr, /*ReqdWGS=*/nullptr, &MinThreadsVal, &MaxThreadsVal);
- else
- continue;
-
- DefaultValMinThreads =
- std::max(DefaultValMinThreads, uint32_t(MinThreadsVal));
- DefaultValMaxThreads =
- DefaultValMaxThreads
- ? std::min(DefaultValMaxThreads, uint32_t(MaxThreadsVal))
- : MaxThreadsVal;
- DefaultValMinTeams = DefaultValMinTeams
- ? std::max(DefaultValMinTeams, MinBlocksVal)
- : MinBlocksVal;
- DefaultValMaxTeams = std::min(DefaultValMaxTeams, MaxBlocksVal);
- }
- }
-
- OMPBuilder.emitTargetRegionFunction(
- EntryInfo, GenerateOutlinedFunction, DefaultValMinTeams,
- DefaultValMaxTeams, DefaultValMinThreads, DefaultValMaxThreads,
- IsOffloadEntry, OutlinedFn, OutlinedFnID);
+ OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
+ IsOffloadEntry, OutlinedFn, OutlinedFnID);
if (!OutlinedFn)
return;
@@ -6306,7 +6303,7 @@ llvm::Value *CGOpenMPRuntime::emitNumTeamsForTargetDirective(
/// store the condition in \p CondVal. If \p E, and \p CondVal respectively, are
/// nullptr, no expression evaluation is perfomed.
static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
- const Expr **E, uint32_t &UpperBound,
+ const Expr **E, int32_t &UpperBound,
bool UpperBoundOnly, llvm::Value **CondVal) {
const Stmt *Child = CGOpenMPRuntime::getSingleCompoundChild(
CGF.getContext(), CS->getCapturedStmt());
@@ -6368,10 +6365,10 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
UpperBound
? Constant->getZExtValue()
: std::min(UpperBound,
- static_cast<uint32_t>(Constant->getZExtValue()));
+ static_cast<int32_t>(Constant->getZExtValue()));
// If we haven't found a upper bound, remember we saw a thread limiting
// clause.
- if (UpperBound == UINT32_MAX)
+ if (UpperBound == -1)
UpperBound = 0;
if (!E)
return;
@@ -6397,7 +6394,7 @@ static void getNumThreads(CodeGenFunction &CGF, const CapturedStmt *CS,
}
const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
- CodeGenFunction &CGF, const OMPExecutableDirective &D, uint32_t &UpperBound,
+ CodeGenFunction &CGF, const OMPExecutableDirective &D, int32_t &UpperBound,
bool UpperBoundOnly, llvm::Value **CondVal, const Expr **ThreadLimitExpr) {
assert((!CGF.getLangOpts().OpenMPIsTargetDevice || UpperBoundOnly) &&
"Clauses associated with the teams directive expected to be emitted "
@@ -6414,11 +6411,11 @@ const Expr *CGOpenMPRuntime::getNumThreadsExprForTargetDirective(
if (auto Constant = E->getIntegerConstantExpr(CGF.getContext()))
UpperBound = UpperBound ? Constant->getZExtValue()
: std::min(UpperBound,
- uint32_t(Constant->getZExtValue()));
+ int32_t(Constant->getZExtValue()));
}
// If we haven't found a upper bound, remember we saw a thread limiting
// clause.
- if (UpperBound == UINT32_MAX)
+ if (UpperBound == -1)
UpperBound = 0;
if (EPtr)
*EPtr = E;
@@ -6562,7 +6559,7 @@ llvm::Value *CGOpenMPRuntime::emitNumThreadsForTargetDirective(
llvm::Value *CondVal = nullptr;
llvm::Value *ThreadLimitVal = nullptr;
const Expr *ThreadLimitExpr = nullptr;
- uint32_t UpperBound = -1;
+ int32_t UpperBound = -1;
const Expr *NT = getNumThreadsExprForTargetDirective(
CGF, D, UpperBound, /* UpperBoundOnly */ false, &CondVal,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index d2f922da3320924..0c4ad46e881b9c5 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -311,6 +311,14 @@ class CGOpenMPRuntime {
/// An OpenMP-IR-Builder instance.
llvm::OpenMPIRBuilder OMPBuilder;
+ /// Helper to determine the min/max number of threads/teams for \p D.
+ void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D,
+ CodeGenFunction &CGF,
+ int32_t &MinThreadsVal,
+ int32_t &MaxThreadsVal,
+ int32_t &MinTeamsVal,
+ int32_t &MaxTeamsVal);
+
/// Helper to emit outlined function for 'target' directive.
/// \param D Directive to emit.
/// \param ParentName Name of the function that encloses the target region.
@@ -649,7 +657,7 @@ class CGOpenMPRuntime {
/// UpperBoundOnly is true, no expression evaluation is perfomed.
const Expr *getNumThreadsExprForTargetDirective(
CodeGenFunction &CGF, const OMPExecutableDirective &D,
- uint32_t &UpperBound, bool UpperBoundOnly,
+ int32_t &UpperBound, bool UpperBoundOnly,
llvm::Value **CondExpr = nullptr, const Expr **ThreadLimitExpr = nullptr);
/// Emit an expression that denotes the number of threads a target region
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 152a7511f4dd1b0..9d00ebae702802a 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -757,13 +757,15 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
// Emit target region as a standalone region.
class NVPTXPrePostActionTy : public PrePostActionTy {
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
+ const OMPExecutableDirective &D;
public:
- NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
- : EST(EST) {}
+ NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
+ const OMPExecutableDirective &D)
+ : EST(EST), D(D) {}
void Enter(CodeGenFunction &CGF) override {
auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
+ RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
@@ -772,7 +774,7 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
}
- } Action(EST);
+ } Action(EST, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
@@ -780,10 +782,17 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
IsInTTDRegion = false;
}
-void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
+void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
+ CodeGenFunction &CGF,
EntryFunctionState &EST, bool IsSPMD) {
+ int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
+ MaxTeamsVal = -1;
+ computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
+ MinTeamsVal, MaxTeamsVal);
+
CGBuilderTy &Bld = CGF.Builder;
- Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
+ Bld.restoreIP(OMPBuilder.createTargetInit(
+ Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
if (!IsSPMD)
emitGenericVarsProlog(CGF, EST.Loc);
}
@@ -815,19 +824,20 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
CGOpenMPRuntimeGPU::EntryFunctionState &EST;
bool IsBareKernel;
DataSharingMode Mode;
+ const OMPExecutableDirective &D;
public:
NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
CGOpenMPRuntimeGPU::EntryFunctionState &EST,
- bool IsBareKernel)
+ bool IsBareKernel, const OMPExecutableDirective &D)
: RT(RT), EST(EST), IsBareKernel(IsBareKernel),
- Mode(RT.CurrentDataSharingMode) {}
+ Mode(RT.CurrentDataSharingMode), D(D) {}
void Enter(CodeGenFunction &CGF) override {
if (IsBareKernel) {
RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
return;
}
- RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
+ RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
// Skip target region initialization.
RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
}
@@ -839,7 +849,7 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
RT.clearLocThreadIdInsertPt(CGF);
RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
}
- } Action(*this, EST, IsBareKernel);
+ } Action(*this, EST, IsBareKernel, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
index c4501a1a2a496b0..46e1361f2f895ba 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -60,8 +60,8 @@ class CGOpenMPRuntimeGPU : public CGOpenMPRuntime {
void syncCTAThreads(CodeGenFunction &CGF);
/// Helper for target directive initialization.
- void emitKernelInit(CodeGenFunction &CGF, EntryFunctionState &EST,
- bool IsSPMD);
+ void emitKernelInit(const OMPExecutableDirective &D, CodeGenFunction &CGF,
+ EntryFunctionState &EST, bool IsSPMD);
/// Helper for target directive finalization.
void emitKernelDeinit(CodeGenFunction &CGF, EntryFunctionState &EST,
diff --git a/clang/test/OpenMP/distribute_simd_codegen.cpp b/clang/test/OpenMP/distribute_simd_codegen.cpp
index 297f508575d99d9..f74abbe32e454f6 100644
--- a/clang/test/OpenMP/distribute_simd_codegen.cpp
+++ b/clang/test/OpenMP/distribute_simd_codegen.cpp
@@ -220,7 +220,7 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: [[TMP32:%.*]] = icmp ne i32 [[TMP31]], 0
// CHECK1-NEXT: br i1 [[TMP32]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]]
// CHECK1: omp_offload.failed:
-// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], ptr [[TMP3]]) #[[ATTR4:[0-9]+]]
+// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], ptr [[TMP3]]) #[[ATTR3:[0-9]+]]
// CHECK1-NEXT: br label [[OMP_OFFLOAD_CONT]]
// CHECK1: omp_offload.cont:
// CHECK1-NEXT: ret void
@@ -242,7 +242,7 @@ int fint(void) { return ftemplate<int>(); }
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z23without_schedule_clausePfS_S_S__l70.omp_outlined
-// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[C:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[D:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK1-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[A:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[B:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[C:%.*]], ptr noundef nonnull align 8 dereferenceable(8) [[D:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
@@ -291,45 +291,45 @@ int fint(void) { return ftemplate<int>(); }
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[DOTOMP_IV]], align 4
// CHECK1-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK1: omp.inner.for.cond:
-// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13:![0-9]+]]
-// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8:![0-9]+]]
+// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP10]], [[TMP11]]
// CHECK1-NEXT: br i1 [[CMP1]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK1: omp.inner.for.body:
-// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 7
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 33, [[MUL]]
-// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: store i32 [[ADD]], ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP1]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64
// CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP13]], i64 [[IDXPROM]]
-// CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP2]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP17:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM2:%.*]] = sext i32 [[TMP17]] to i64
// CHECK1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 [[IDXPROM2]]
-// CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP18:%.*]] = load float, ptr [[ARRAYIDX3]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[MUL4:%.*]] = fmul float [[TMP15]], [[TMP18]]
-// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP19:%.*]] = load ptr, ptr [[TMP3]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP20:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP20]] to i64
// CHECK1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[TMP19]], i64 [[IDXPROM5]]
-// CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP21:%.*]] = load float, ptr [[ARRAYIDX6]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[MUL7:%.*]] = fmul float [[MUL4]], [[TMP21]]
-// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP13]]
-// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP22:%.*]] = load ptr, ptr [[TMP0]], align 8, !llvm.access.group [[ACC_GRP8]]
+// CHECK1-NEXT: [[TMP23:%.*]] = load i32, ptr [[I]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[IDXPROM8:%.*]] = sext i32 [[TMP23]] to i64
// CHECK1-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds float, ptr [[TMP22]], i64 [[IDXPROM8]]
-// CHECK1-NEXT: store float [[MUL7]], ptr [[ARRAYIDX9]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: store float [[MUL7]], ptr [[ARRAYIDX9]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK1: omp.body.continue:
// CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK1: omp.inner.for.inc:
-// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP13]]
+// CHECK1-NEXT: [[TMP24:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4, !llvm.access.group [[ACC_GRP8]]
// CHECK1-NEXT: [[ADD10:%.*]] = add...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/70383
More information about the llvm-commits
mailing list