[llvm] 8ac7210 - [AMDGPU] SelectionDAG divergence tracking should take into account Target divergency. (#144947)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 3 09:37:40 PDT 2025
Author: alex-t
Date: 2025-07-03T18:37:37+02:00
New Revision: 8ac7210b7f0ad49ae7809bf6a9faf2f7433384b0
URL: https://github.com/llvm/llvm-project/commit/8ac7210b7f0ad49ae7809bf6a9faf2f7433384b0
DIFF: https://github.com/llvm/llvm-project/commit/8ac7210b7f0ad49ae7809bf6a9faf2f7433384b0.diff
LOG: [AMDGPU] SelectionDAG divergence tracking should take into account Target divergency. (#144947)
If a kernel is known to be executing only a single lane, IR
UniformityAnalysis will take note of that (via
GCNTTIImpl::hasBranchDivergence) and report that all values are uniform.
SelectionDAG's built-in divergence tracking should do the same.
Added:
llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
Modified:
llvm/include/llvm/CodeGen/SelectionDAG.h
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index a98e46c587273..66578bdaa2781 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -238,6 +238,8 @@ class SelectionDAG {
LLVMContext *Context;
CodeGenOptLevel OptLevel;
+ bool DivergentTarget = false;
+
UniformityInfo *UA = nullptr;
FunctionLoweringInfo * FLI = nullptr;
@@ -471,14 +473,16 @@ class SelectionDAG {
Pass *PassPtr, const TargetLibraryInfo *LibraryInfo,
UniformityInfo *UA, ProfileSummaryInfo *PSIin,
BlockFrequencyInfo *BFIin, MachineModuleInfo &MMI,
- FunctionVarLocs const *FnVarLocs);
+ FunctionVarLocs const *FnVarLocs, bool HasDivergency);
void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE,
MachineFunctionAnalysisManager &AM,
const TargetLibraryInfo *LibraryInfo, UniformityInfo *UA,
ProfileSummaryInfo *PSIin, BlockFrequencyInfo *BFIin,
- MachineModuleInfo &MMI, FunctionVarLocs const *FnVarLocs) {
- init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, MMI, FnVarLocs);
+ MachineModuleInfo &MMI, FunctionVarLocs const *FnVarLocs,
+ bool HasDivergency) {
+ init(NewMF, NewORE, nullptr, LibraryInfo, UA, PSIin, BFIin, MMI, FnVarLocs,
+ HasDivergency);
MFAM = &AM;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 6df21b624137f..024d5661f0cb0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1370,7 +1370,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
const TargetLibraryInfo *LibraryInfo,
UniformityInfo *NewUA, ProfileSummaryInfo *PSIin,
BlockFrequencyInfo *BFIin, MachineModuleInfo &MMIin,
- FunctionVarLocs const *VarLocs) {
+ FunctionVarLocs const *VarLocs, bool HasDivergency) {
MF = &NewMF;
SDAGISelPass = PassPtr;
ORE = &NewORE;
@@ -1383,6 +1383,7 @@ void SelectionDAG::init(MachineFunction &NewMF,
BFI = BFIin;
MMI = &MMIin;
FnVarLocs = VarLocs;
+ DivergentTarget = HasDivergency;
}
SelectionDAG::~SelectionDAG() {
@@ -2329,7 +2330,8 @@ SDValue SelectionDAG::getRegister(Register Reg, EVT VT) {
return SDValue(E, 0);
auto *N = newSDNode<RegisterSDNode>(Reg, VTs);
- N->SDNodeBits.IsDivergent = TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
+ N->SDNodeBits.IsDivergent =
+ DivergentTarget && TLI->isSDNodeSourceOfDivergence(N, FLI, UA);
CSEMap.InsertNode(N, IP);
InsertNode(N);
return SDValue(N, 0);
@@ -12067,6 +12069,8 @@ static bool gluePropagatesDivergence(const SDNode *Node) {
}
bool SelectionDAG::calculateDivergence(SDNode *N) {
+ if(!DivergentTarget)
+ return false;
if (TLI->isSDNodeAlwaysUniform(N)) {
assert(!TLI->isSDNodeSourceOfDivergence(N, FLI, UA) &&
"Conflicting divergence information!");
@@ -12086,6 +12090,8 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
}
void SelectionDAG::updateDivergence(SDNode *N) {
+ if (!DivergentTarget)
+ return;
SmallVector<SDNode *, 16> Worklist(1, N);
do {
N = Worklist.pop_back_val();
@@ -13633,16 +13639,20 @@ void SelectionDAG::createOperands(SDNode *Node, ArrayRef<SDValue> Vals) {
Ops[I].setInitial(Vals[I]);
EVT VT = Ops[I].getValueType();
+ // Take care of the Node's operands iff target has divergence
// Skip Chain. It does not carry divergence.
- if (VT != MVT::Other &&
+ if (DivergentTarget && VT != MVT::Other &&
(VT != MVT::Glue || gluePropagatesDivergence(Ops[I].getNode())) &&
Ops[I].getNode()->isDivergent()) {
+ // Node is going to be divergent if at least one of its operand is
+ // divergent, unless it belongs to the "AlwaysUniform" exemptions.
IsDivergent = true;
}
}
Node->NumOperands = Vals.size();
Node->OperandList = Ops;
- if (!TLI->isSDNodeAlwaysUniform(Node)) {
+ // Check the divergence of the Node itself.
+ if (DivergentTarget && !TLI->isSDNodeAlwaysUniform(Node)) {
IsDivergent |= TLI->isSDNodeSourceOfDivergence(Node, FLI, UA);
Node->SDNodeBits.IsDivergent = IsDivergent;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index d9b9cf6bcc772..6260ad2f24dea 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -482,7 +482,10 @@ void SelectionDAGISel::initializeAnalysisResults(
MachineModuleInfo &MMI =
MAMP.getCachedResult<MachineModuleAnalysis>(*Fn.getParent())->getMMI();
- CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
+ TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
+
+ CurDAG->init(*MF, *ORE, MFAM, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
+ TTI->hasBranchDivergence(&Fn));
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -500,10 +503,6 @@ void SelectionDAGISel::initializeAnalysisResults(
BatchAA = std::nullopt;
SP = &FAM.getResult<SSPLayoutAnalysis>(Fn);
-
-#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
- TTI = &FAM.getResult<TargetIRAnalysis>(Fn);
-#endif
}
void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
@@ -539,7 +538,10 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
MachineModuleInfo &MMI =
MFP.getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs);
+ TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+
+ CurDAG->init(*MF, *ORE, &MFP, LibInfo, UA, PSI, BFI, MMI, FnVarLocs,
+ TTI->hasBranchDivergence(&Fn));
// Now get the optional analyzes if we want to.
// This is based on the possibly changed OptLevel (after optnone is taken
@@ -558,10 +560,6 @@ void SelectionDAGISel::initializeAnalysisResults(MachineFunctionPass &MFP) {
BatchAA = std::nullopt;
SP = &MFP.getAnalysis<StackProtector>().getLayoutInfo();
-
-#if !defined(NDEBUG) && LLVM_ENABLE_ABI_BREAKING_CHECKS
- TTI = &MFP.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
-#endif
}
bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
index 13ea8b08d2ade..91a8446915f66 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-always-uniform.ll
@@ -18,9 +18,10 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x8
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_or3_b32 v2, s2, v2, s0
+; GFX11-NEXT: s_or_b32 s1, s2, s3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_or_b32 s0, s0, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: s_endpgm
;
@@ -33,12 +34,14 @@ define amdgpu_cs void @test_uniform_load_b96(ptr addrspace(1) %ptr, i32 %arg) "a
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v1, v3, vcc_lo
; GFX12-NEXT: v_readfirstlane_b32 s0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX12-NEXT: v_readfirstlane_b32 s1, v3
; GFX12-NEXT: s_load_b96 s[0:2], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_or_b32 s0, s0, s1
+; GFX12-NEXT: s_or_b32 s0, s2, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: v_mov_b32_e32 v2, s0
-; GFX12-NEXT: v_or3_b32 v2, v2, s1, s2
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
; GFX12-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll b/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
new file mode 100644
index 0000000000000..726e35d6651d0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GCN %s
+
+declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
+
+
+define amdgpu_kernel void @test_isel_single_lane(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 {
+; GCN-LABEL: test_isel_single_lane:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_load_b32 s4, s[0:1], 0x58
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
+; GCN-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GCN-NEXT: s_wait_loadcnt 0x0
+; GCN-NEXT: v_readfirstlane_b32 s0, v1
+; GCN-NEXT: s_addk_co_i32 s0, 0xf4
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_lshl_b32 s1, s0, 4
+; GCN-NEXT: s_mul_i32 s0, s0, s1
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_lshl_b32 s0, s0, 12
+; GCN-NEXT: s_sub_co_i32 s0, s1, s0
+; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
+ %gep0 = getelementptr i32, ptr addrspace(1) %in, i32 22
+ %val0 = load i32, ptr addrspace(1) %gep0, align 4
+ %gep1 = getelementptr i32, ptr addrspace(1) %in, i32 4
+ %val1 = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr addrspace(1) %gep1, i32 %val0)
+ %res0 = add i32 %val1, 244
+ %res1 = shl i32 %res0, 4
+ %res2 = mul i32 %res0, %res1
+ %res3 = shl i32 %res2, 12
+ %res4 = sub i32 %res1, %res3
+ store i32 %res4, ptr addrspace(1) %out
+ ret void
+}
+
+
+attributes #0 = {
+ "amdgpu-flat-work-group-size"="1,1"
+ "amdgpu-waves-per-eu"="1,1"
+ "uniform-work-group-size"="true"
+}
diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index 360fc9c8af842..d3f1a87572e2a 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -9,6 +9,7 @@
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -78,8 +79,12 @@ class SelectionDAGAddressAnalysisTest : public testing::Test {
if (!DAG)
report_fatal_error("DAG?");
OptimizationRemarkEmitter ORE(F);
+ FunctionAnalysisManager FAM;
+ FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
+
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(*F);
DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
- nullptr);
+ nullptr, TTI.hasBranchDivergence());
}
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index baee2868d2d60..433ec7b6ae6c7 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/AsmParser/Parser.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/SDPatternMatch.h"
@@ -76,8 +77,12 @@ class SelectionDAGPatternMatchTest : public testing::Test {
if (!DAG)
report_fatal_error("DAG?");
OptimizationRemarkEmitter ORE(F);
+ FunctionAnalysisManager FAM;
+ FAM.registerPass([&] { return TM->getTargetIRAnalysis(); });
+
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(*F);
DAG->init(*MF, ORE, nullptr, nullptr, nullptr, nullptr, nullptr, MMI,
- nullptr);
+ nullptr, TTI.hasBranchDivergence());
}
TargetLoweringBase::LegalizeTypeAction getTypeAction(EVT VT) {
More information about the llvm-commits
mailing list