[llvm] 03d7e61 - [OpenMP] Internalize functions in OpenMPOpt to improve IPO passes
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 22 09:38:26 PDT 2021
Author: Joseph Huber
Date: 2021-06-22T12:38:10-04:00
New Revision: 03d7e61c87eb94083d22ff55cf30c0a378ab6824
URL: https://github.com/llvm/llvm-project/commit/03d7e61c87eb94083d22ff55cf30c0a378ab6824
DIFF: https://github.com/llvm/llvm-project/commit/03d7e61c87eb94083d22ff55cf30c0a378ab6824.diff
LOG: [OpenMP] Internalize functions in OpenMPOpt to improve IPO passes
Summary:
Currently the attributor needs to give up if a function has external linkage.
This means that the optimization introduced in D97818 will only apply to static
functions. This change uses the Attributor to internalize OpenMP device
routines by making a copy of each function with private linkage and replacing
the uses in the module with it. This allows for the optimization to be applied
to any regular function.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D102824
Added:
Modified:
clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
llvm/include/llvm/Transforms/IPO/Attributor.h
llvm/lib/Transforms/IPO/Attributor.cpp
llvm/lib/Transforms/IPO/OpenMPOpt.cpp
llvm/test/Transforms/OpenMP/replace_globalization.ll
llvm/test/Transforms/OpenMP/single_threaded_execution.ll
Removed:
################################################################################
diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
index 72593b96bd1ba..8b45d4dc789e5 100644
--- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// host-no-diagnostics
void bar1(void) {
#pragma omp parallel // #0
// all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
- // safe-remark@#0 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+ // safe-remark@#0 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
// force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: <NONE>}}
{
}
@@ -15,7 +15,7 @@ void bar1(void) {
void bar2(void) {
#pragma omp parallel // #1
// all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
- // safe-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+ // safe-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
// force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: <NONE>}}
{
}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
index 604c2f2abfc25..ca6a9afa3b3f5 100644
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -verify=host -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -verify=host -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp-opt -Rpass-analysis=openmp-opt -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
// host-no-diagnostics
void bar(void) {
#pragma omp parallel // #1 \
// expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
- // expected-remark@#1 {{Parallel region is used in unexpected ways; will not attempt to rewrite the state machine.}}
+ // expected-remark@#1 {{Parallel region is used in unknown ways; will not attempt to rewrite the state machine.}}
{
}
}
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 88b7848b45b28..a3e1905ecbf70 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1083,13 +1083,15 @@ struct Attributor {
/// the abstract attributes.
/// \param CGUpdater Helper to update an underlying call graph.
/// \param Allowed If not null, a set limiting the attribute opportunities.
- /// \param DeleteFns Whether to delete functions
+ /// \param DeleteFns Whether to delete functions.
+ /// \param RewriteSignatures Whether to rewrite function signatures.
Attributor(SetVector<Function *> &Functions, InformationCache &InfoCache,
CallGraphUpdater &CGUpdater,
- DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true)
+ DenseSet<const char *> *Allowed = nullptr, bool DeleteFns = true,
+ bool RewriteSignatures = true)
: Allocator(InfoCache.Allocator), Functions(Functions),
InfoCache(InfoCache), CGUpdater(CGUpdater), Allowed(Allowed),
- DeleteFns(DeleteFns) {}
+ DeleteFns(DeleteFns), RewriteSignatures(RewriteSignatures) {}
~Attributor();
@@ -1665,6 +1667,21 @@ struct Attributor {
///
static void createShallowWrapper(Function &F);
+ /// Make another copy of the function \p F such that the copied version has
+ /// internal linkage afterwards and can be analysed. Then we replace all uses
+ /// of the original function to the copied one
+ ///
+ /// Only non-locally linked functions that have `linkonce_odr` or `weak_odr`
+ /// linkage can be internalized because these linkages guarantee that other
+ /// definitions with the same name have the same semantics as this one.
+ ///
+ /// This will only be run if the `attributor-allow-deep-wrappers` option is
+ /// set, or if the function is called with \p Force set to true.
+ ///
+ /// If the function \p F failed to be internalized the return value will be a
+ /// null pointer.
+ static Function *internalizeFunction(Function &F, bool Force = false);
+
/// Return the data layout associated with the anchor scope.
const DataLayout &getDataLayout() const { return InfoCache.DL; }
@@ -1777,6 +1794,9 @@ struct Attributor {
/// Whether to delete functions.
const bool DeleteFns;
+ /// Whether to rewrite signatures.
+ const bool RewriteSignatures;
+
/// A set to remember the functions we already assume to be live and visited.
DenseSet<const Function *> VisitedFunctions;
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 8a918edf1cb7c..b4d66b352c536 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -1621,19 +1621,12 @@ void Attributor::createShallowWrapper(Function &F) {
NumFnShallowWrappersCreated++;
}
-/// Make another copy of the function \p F such that the copied version has
-/// internal linkage afterwards and can be analysed. Then we replace all uses
-/// of the original function to the copied one
-///
-/// Only non-exactly defined functions that have `linkonce_odr` or `weak_odr`
-/// linkage can be internalized because these linkages guarantee that other
-/// definitions with the same name have the same semantics as this one
-///
-static Function *internalizeFunction(Function &F) {
- assert(AllowDeepWrapper && "Cannot create a copy if not allowed.");
- assert(!F.isDeclaration() && !F.hasExactDefinition() &&
- !GlobalValue::isInterposableLinkage(F.getLinkage()) &&
- "Trying to internalize function which cannot be internalized.");
+Function *Attributor::internalizeFunction(Function &F, bool Force) {
+ if (!AllowDeepWrapper && !Force)
+ return nullptr;
+ if (F.isDeclaration() || F.hasLocalLinkage() ||
+ GlobalValue::isInterposableLinkage(F.getLinkage()))
+ return nullptr;
Module &M = *F.getParent();
FunctionType *FnTy = F.getFunctionType();
@@ -1663,7 +1656,8 @@ static Function *internalizeFunction(Function &F) {
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
F.getAllMetadata(MDs);
for (auto MDIt : MDs)
- Copied->addMetadata(MDIt.first, *MDIt.second);
+ if (!Copied->hasMetadata())
+ Copied->addMetadata(MDIt.first, *MDIt.second);
M.getFunctionList().insert(F.getIterator(), Copied);
F.replaceAllUsesWith(Copied);
@@ -1675,6 +1669,9 @@ static Function *internalizeFunction(Function &F) {
bool Attributor::isValidFunctionSignatureRewrite(
Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
+ if (!RewriteSignatures)
+ return false;
+
auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
// Forbid the call site to cast the function return type. If we need to
// rewrite these functions we need to re-create a cast for the new call site
@@ -2459,7 +2456,8 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
Function *F = Functions[u];
if (!F->isDeclaration() && !F->isDefinitionExact() && F->getNumUses() &&
!GlobalValue::isInterposableLinkage(F->getLinkage())) {
- Function *NewF = internalizeFunction(*F);
+ Function *NewF = Attributor::internalizeFunction(*F);
+ assert(NewF && "Could not internalize function.");
Functions.insert(NewF);
// Update call graph
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 9b43134e378aa..feee0fae92150 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1623,9 +1623,9 @@ struct OpenMPOpt {
};
GlobalizationRFI.foreachUse(SCC, CreateAA);
- for (auto &F : M) {
- if (!F.isDeclaration())
- A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(F));
+ for (auto *F : SCC) {
+ if (!F->isDeclaration())
+ A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
}
}
};
@@ -2620,11 +2620,19 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
if (DisableOpenMPOptimizations)
return PreservedAnalyses::all();
- // Look at every function definition in the Module.
+ // Create internal copies of each function if this is a kernel Module.
+ DenseSet<const Function *> InternalizedFuncs;
+ if (!OMPInModule.getKernels().empty())
+ for (Function &F : M)
+ if (!F.isDeclaration() && !OMPInModule.getKernels().contains(&F))
+ if (Attributor::internalizeFunction(F, /* Force */ true))
+ InternalizedFuncs.insert(&F);
+
+ // Look at every function definition in the Module that wasn't internalized.
SmallVector<Function *, 16> SCC;
- for (Function &Fn : M)
- if (!Fn.isDeclaration())
- SCC.push_back(&Fn);
+ for (Function &F : M)
+ if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
+ SCC.push_back(&F);
if (SCC.empty())
return PreservedAnalyses::all();
@@ -2645,7 +2653,7 @@ PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions,
OMPInModule.getKernels());
- Attributor A(Functions, InfoCache, CGUpdater);
+ Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false);
OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
bool Changed = OMPOpt.run(true);
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
index 5a513dd1046ca..d50f1f7a86b2b 100644
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -2,6 +2,8 @@
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64"
+ at S = external local_unnamed_addr global i8*
+
; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
@@ -67,7 +69,7 @@ exit:
define void @use(i8* %x) {
entry:
%addr = alloca i8*
- store i8* %x, i8** %addr
+ store i8* %x, i8** @S
ret void
}
diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
index 3dbfc9eb8b52c..327905efd559d 100644
--- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
+++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll
@@ -1,8 +1,8 @@
-; RUN: opt -passes=openmp-opt-cgscc -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -passes=openmp-opt -debug-only=openmp-opt -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; ModuleID = 'single_threaded_exeuction.c'
-define void @kernel() {
+define weak void @kernel() {
call void @__kmpc_kernel_init(i32 512, i16 1)
call void @nvptx()
call void @amdgcn()
@@ -12,14 +12,15 @@ define void @kernel() {
; CHECK-NOT: [openmp-opt] Basic block @nvptx entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @nvptx if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @nvptx if.end is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define dso_local void @nvptx() {
+; Function Attrs: noinline
+define internal void @nvptx() {
entry:
%call = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%cmp = icmp eq i32 %call, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
+ call void @foo()
call void @bar()
br label %if.end
@@ -30,14 +31,15 @@ if.end:
; CHECK-NOT: [openmp-opt] Basic block @amdgcn entry is executed by a single thread.
; CHECK: [openmp-opt] Basic block @amdgcn if.then is executed by a single thread.
; CHECK-NOT: [openmp-opt] Basic block @amdgcn if.end is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define dso_local void @amdgcn() {
+; Function Attrs: noinline
+define internal void @amdgcn() {
entry:
%call = call i32 @llvm.amdgcn.workitem.id.x()
%cmp = icmp eq i32 %call, 0
br i1 %cmp, label %if.then, label %if.end
if.then:
+ call void @foo()
call void @bar()
br label %if.end
@@ -45,9 +47,16 @@ if.end:
ret void
}
-; CHECK: [openmp-opt] Basic block @bar entry is executed by a single thread.
-; Function Attrs: noinline nounwind uwtable
-define internal void @bar() {
+; CHECK: [openmp-opt] Basic block @foo entry is executed by a single thread.
+; Function Attrs: noinline
+define internal void @foo() {
+entry:
+ ret void
+}
+
+; CHECK: [openmp-opt] Basic block @bar.internalized entry is executed by a single thread.
+; Function Attrs: noinline
+define void @bar() {
entry:
ret void
}
More information about the llvm-commits
mailing list