[llvm] a5455e3 - [AMDGPUUnifyDivergentExitNodes] Add NewPM support
Anshil Gandhi via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 16 09:14:52 PDT 2023
Author: Anshil Gandhi
Date: 2023-03-16T16:13:29Z
New Revision: a5455e32b364dabe499ec11722626d4bbaf047ba
URL: https://github.com/llvm/llvm-project/commit/a5455e32b364dabe499ec11722626d4bbaf047ba
DIFF: https://github.com/llvm/llvm-project/commit/a5455e32b364dabe499ec11722626d4bbaf047ba.diff
LOG: [AMDGPUUnifyDivergentExitNodes] Add NewPM support
Additionally, use UniformityAnalysis instead of LegacyDivergenceAnalysis to collect divergence info.
Reviewed By: arsenm, sameerds
Differential Revision: https://reviews.llvm.org/D141355
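With this change, the pass can be invoked by name through opt's new pass
manager pipeline syntax (the spelling registered in AMDGPUTargetMachine
below), for example:

  opt -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-unify-divergent-exit-nodes in.ll -S -o -

(in.ll is a placeholder input; the updated test below uses the short -p
alias and runs simplifycfg first.)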
Added:
llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
Modified:
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 64dc8604e76ac..fe7a287657b00 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,6 +22,7 @@
#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
@@ -655,6 +656,10 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUPromoteKernelArgumentsPass());
return true;
}
+ if (PassName == "amdgpu-unify-divergent-exit-nodes") {
+ PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
+ return true;
+ }
return false;
});
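For context, the hunk above extends the pipeline-parsing callback that
AMDGPUTargetMachine registers with the PassBuilder. A minimal,
self-contained sketch of that mechanism with a hypothetical pass
(MyPass and the "my-pass" name are illustrative, not part of this
commit):

  #include "llvm/Passes/PassBuilder.h"

  using namespace llvm;

  // Hypothetical new-PM function pass, used only for illustration.
  struct MyPass : PassInfoMixin<MyPass> {
    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
      return PreservedAnalyses::all(); // makes no changes
    }
  };

  static void registerMyCallbacks(PassBuilder &PB) {
    PB.registerPipelineParsingCallback(
        [](StringRef Name, FunctionPassManager &PM,
           ArrayRef<PassBuilder::PipelineElement>) {
          if (Name == "my-pass") { // textual name used with -passes=
            PM.addPass(MyPass());
            return true;           // name handled
          }
          return false;            // let other callbacks try
        });
  }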
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 5f204f5be51ae..5d59ee47ec430 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -19,6 +19,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
@@ -53,25 +54,33 @@ using namespace llvm;
namespace {
-class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+class AMDGPUUnifyDivergentExitNodesImpl {
private:
const TargetTransformInfo *TTI = nullptr;
public:
- static char ID; // Pass identification, replacement for typeid
-
- AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
- initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
- }
+ AMDGPUUnifyDivergentExitNodesImpl() = delete;
+ AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI)
+ : TTI(TTI) {}
// We can preserve non-critical-edgeness when we unify function exit nodes
- void getAnalysisUsage(AnalysisUsage &AU) const override;
BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
ArrayRef<BasicBlock *> ReturningBlocks,
StringRef Name);
- bool runOnFunction(Function &F) override;
+ bool run(Function &F, DominatorTree &DT, const PostDominatorTree &PDT,
+ const UniformityInfo &UA);
};
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+ initializeAMDGPUUnifyDivergentExitNodesPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
} // end anonymous namespace
char AMDGPUUnifyDivergentExitNodes::ID = 0;
@@ -79,14 +88,14 @@ char AMDGPUUnifyDivergentExitNodes::ID = 0;
char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
- "Unify divergent function exit nodes", false, false)
+ "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
-void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
if (RequireAndPreserveDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
@@ -132,7 +141,7 @@ static bool isUniformlyReached(const UniformityInfo &UA, BasicBlock &BB) {
return true;
}
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add a PHI
@@ -180,21 +189,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
return NewRetBlock;
}
-bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
- DominatorTree *DT = nullptr;
- if (RequireAndPreserveDomTree)
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ const UniformityInfo &UA) {
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
return false;
- UniformityInfo &UA =
- getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
@@ -327,3 +329,30 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const auto &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  const auto *TransformInfo =
+      &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, *DT, PDT, UA);
+}
+
+PreservedAnalyses
+AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
+ const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F);
+ return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, *DT, PDT, UA)
+ ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
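The restructuring above follows the usual new-PM porting pattern: the
transform logic moves into an Impl class, and both pass-manager
interfaces become thin wrappers that gather the analyses and delegate.
A minimal sketch of that shape with hypothetical names (MyPassImpl,
MyLegacyPass, and MyPass are illustrative only):

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Pass.h"

  using namespace llvm;

  class MyPassImpl {
  public:
    bool run(Function &F) { return false; } // the shared transform logic
  };

  class MyLegacyPass : public FunctionPass { // legacy PM wrapper
  public:
    static char ID;
    MyLegacyPass() : FunctionPass(ID) {}
    bool runOnFunction(Function &F) override { return MyPassImpl().run(F); }
  };
  char MyLegacyPass::ID = 0;

  class MyPass : public PassInfoMixin<MyPass> { // new PM wrapper
  public:
    PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
      // Report "none preserved" only when the IR actually changed,
      // mirroring the ternary at the end of the hunk above.
      return MyPassImpl().run(F) ? PreservedAnalyses::none()
                                 : PreservedAnalyses::all();
    }
  };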
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
new file mode 100644
index 0000000000000..e58925bc01d9e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
@@ -0,0 +1,31 @@
+//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+namespace llvm {
+class AMDGPUUnifyDivergentExitNodesPass
+ : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
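With this declaration exported from the new header, pipeline code can
also schedule the pass directly by type rather than by string name; a
minimal sketch (assuming a FunctionPassManager is already set up):

  FunctionPassManager FPM;
  FPM.addPass(AMDGPUUnifyDivergentExitNodesPass());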
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
index 58d5dc20d5ac5..13f8eff94f86b 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll
@@ -1,36 +1,48 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -p simplifycfg,amdgpu-unify-divergent-exit-nodes %s -S -o - | FileCheck %s --check-prefix=OPT
+; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA
define void @nested_inf_loop(i1 %0, i1 %1) {
-; CHECK-LABEL: nested_inf_loop:
-; CHECK-NEXT: %bb.0: ; %BB
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_and_b32_e32 v1, 1, v1
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; CHECK-NEXT: s_xor_b64 s[6:7], vcc, -1
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_1: ; %BB1
-; CHECK: s_and_b64 s[10:11], exec, s[6:7]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_1
-; CHECK-NEXT: %bb.2: ; %BB2
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: .LBB0_3: ; %BB4
-; CHECK: s_and_b64 s[10:11], exec, s[4:5]
-; CHECK-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
-; CHECK-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_cbranch_execnz .LBB0_3
-; CHECK-NEXT: %bb.4: ; %loop.exit.guard
-; CHECK: s_or_b64 exec, exec, s[8:9]
-; CHECK-NEXT: s_mov_b64 vcc, 0
-; CHECK-NEXT: s_mov_b64 s[8:9], 0
-; CHECK-NEXT: s_branch .LBB0_1
-; CHECK-NEXT: %bb.5: ; %DummyReturnBlock
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; OPT-LABEL: @nested_inf_loop(
+; OPT-NEXT: BB:
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: BB1:
+; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]]
+; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]]
+; OPT: infloop:
+; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]]
+; OPT: DummyReturnBlock:
+; OPT-NEXT: ret void
+;
+; ISA-LABEL: nested_inf_loop:
+; ISA-NEXT: %bb.0: ; %BB
+; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISA-NEXT: v_and_b32_e32 v1, 1, v1
+; ISA-NEXT: v_and_b32_e32 v0, 1, v0
+; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1
+; ISA-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; ISA-NEXT: s_xor_b64 s[6:7], vcc, -1
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_1: ; %BB1
+; ISA: s_and_b64 s[10:11], exec, s[6:7]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_1
+; ISA-NEXT: %bb.2: ; %BB2
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: .LBB0_3: ; %BB4
+; ISA: s_and_b64 s[10:11], exec, s[4:5]
+; ISA-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9]
+; ISA-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_cbranch_execnz .LBB0_3
+; ISA-NEXT: %bb.4: ; %loop.exit.guard
+; ISA: s_or_b64 exec, exec, s[8:9]
+; ISA-NEXT: s_mov_b64 vcc, 0
+; ISA-NEXT: s_mov_b64 s[8:9], 0
+; ISA-NEXT: s_branch .LBB0_1
+; ISA-NEXT: %bb.5: ; %DummyReturnBlock
+; ISA-NEXT: s_setpc_b64 s[30:31]
BB:
br label %BB1