[llvm] 56d92c1 - [MachineScheduler] Track physical register dependencies per-regunit
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 7 07:41:53 PDT 2023
Author: Jay Foad
Date: 2023-08-07T15:41:40+01:00
New Revision: 56d92c17583e5f0b5e1e521b5f614be79436fccc
URL: https://github.com/llvm/llvm-project/commit/56d92c17583e5f0b5e1e521b5f614be79436fccc
DIFF: https://github.com/llvm/llvm-project/commit/56d92c17583e5f0b5e1e521b5f614be79436fccc.diff
LOG: [MachineScheduler] Track physical register dependencies per-regunit
Change the scheduler's physical register dependency tracking from
registers-and-their-aliases to regunits. This has a couple of advantages
when subregisters are used:

- The dependency tracking is more accurate and creates fewer useless
  edges in the dependency graph. An AMDGPU example, edited for clarity:

    SU(0): $vgpr1 = V_MOV_B32 $sgpr0
    SU(1): $vgpr1 = V_ADDC_U32 0, $vgpr1
    SU(2): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 $vgpr0_vgpr1, 0, 0

  There is a data dependency on $vgpr1 from SU(0) to SU(1) and from
  SU(1) to SU(2). But the old dependency tracking code also added a
  useless edge from SU(0) to SU(2) because it thought that SU(0)'s def
  of $vgpr1 aliased with SU(2)'s use of $vgpr0_vgpr1.

- On targets like AMDGPU that make heavy use of subregisters, each
  register can have a huge number of aliases - it can be quadratic in
  the size of the largest defined register tuple. There is a much lower
  bound on the number of regunits per register, so iterating over
  regunits is faster than iterating over aliases.
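For reference, here is a minimal sketch (not the committed code) contrasting
the two iteration styles the commit replaces and introduces. It assumes an
in-tree LLVM context with a valid TargetRegisterInfo pointer; visitReg and
visitUnit are hypothetical stand-ins for the scheduler's dependency
bookkeeping:

  #include "llvm/CodeGen/TargetRegisterInfo.h"
  #include "llvm/MC/MCRegisterInfo.h"

  using namespace llvm;

  static void visitReg(MCRegister R) { /* stand-in for per-register work */ }
  static void visitUnit(MCRegUnit U) { /* stand-in for per-regunit work */ }

  static void trackPhysRegDeps(const TargetRegisterInfo *TRI, MCRegister Reg) {
    // Old scheme: walk every register that overlaps Reg, including Reg
    // itself. With large register tuples this alias set can be quadratic
    // in the size of the largest tuple.
    for (MCRegAliasIterator Alias(Reg, TRI, /*IncludeSelf=*/true);
         Alias.isValid(); ++Alias)
      visitReg(*Alias);

    // New scheme: walk only the register units Reg covers. A tuple such as
    // $vgpr0_vgpr1 contributes one unit per component register, so this
    // loop stays short no matter how many wider tuples alias it.
    for (MCRegUnit Unit : TRI->regunits(Reg))
      visitUnit(Unit);
  }

The actual scheduler changes are in the diff below; the sketch only
illustrates why the per-unit loop is both more precise and cheaper.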
The LLVM compile-time tracker shows a tiny overall improvement of 0.03%
on X86. I expect a larger compile-time improvement on targets like
AMDGPU.
Recommit after fixing AggressiveAntiDepBreaker in D156880.
Differential Revision: https://reviews.llvm.org/D156552
Added:

Modified:
    llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
    llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
    llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
    llvm/test/CodeGen/AMDGPU/load-global-i16.ll
    llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
    llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
    llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
    llvm/test/CodeGen/Thumb2/mve-vldst4.ll
    llvm/test/CodeGen/Thumb2/mve-vst3.ll

Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 5ea68e0a64af9c..abffcd5dca16fb 100644
--- a/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/llvm/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -77,11 +77,12 @@ namespace llvm {
struct PhysRegSUOper {
SUnit *SU;
int OpIdx;
- unsigned Reg;
+ unsigned RegUnit;
- PhysRegSUOper(SUnit *su, int op, unsigned R): SU(su), OpIdx(op), Reg(R) {}
+ PhysRegSUOper(SUnit *su, int op, unsigned R)
+ : SU(su), OpIdx(op), RegUnit(R) {}
- unsigned getSparseSetIndex() const { return Reg; }
+ unsigned getSparseSetIndex() const { return RegUnit; }
};
/// Use a SparseMultiSet to track physical registers. Storage is only
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 37a1ef0c8d64d2..a42f842b70df7d 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -211,7 +211,8 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
for (const MachineOperand &MO : ExitMI->all_uses()) {
Register Reg = MO.getReg();
if (Reg.isPhysical()) {
- Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
} else if (Reg.isVirtual() && MO.readsReg()) {
addVRegUseDeps(&ExitSU, MO.getOperandNo());
}
@@ -222,8 +223,11 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
// uses all the registers that are livein to the successor blocks.
for (const MachineBasicBlock *Succ : BB->successors()) {
for (const auto &LI : Succ->liveins()) {
- if (!Uses.contains(LI.PhysReg))
- Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
+ // TODO: Use LI.LaneMask to refine this.
+ for (MCRegUnit Unit : TRI->regunits(LI.PhysReg)) {
+ if (!Uses.contains(Unit))
+ Uses.insert(PhysRegSUOper(&ExitSU, -1, Unit));
+ }
}
}
}
@@ -244,8 +248,8 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
const MCInstrDesc &DefMIDesc = SU->getInstr()->getDesc();
bool ImplicitPseudoDef = (OperIdx >= DefMIDesc.getNumOperands() &&
!DefMIDesc.hasImplicitDefOfPhysReg(Reg));
- for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
- for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ for (Reg2SUnitsMap::iterator I = Uses.find(Unit); I != Uses.end(); ++I) {
SUnit *UseSU = I->SU;
if (UseSU == SU)
continue;
@@ -262,11 +266,14 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
// Set the hasPhysRegDefs only for physreg defs that have a use within
// the scheduling region.
SU->hasPhysRegDefs = true;
+
UseInstr = UseSU->getInstr();
+ Register UseReg = UseInstr->getOperand(UseOpIdx).getReg();
const MCInstrDesc &UseMIDesc = UseInstr->getDesc();
- ImplicitPseudoUse = (UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
- !UseMIDesc.hasImplicitUseOfPhysReg(*Alias));
- Dep = SDep(SU, SDep::Data, *Alias);
+ ImplicitPseudoUse = UseOpIdx >= ((int)UseMIDesc.getNumOperands()) &&
+ !UseMIDesc.hasImplicitUseOfPhysReg(UseReg);
+
+ Dep = SDep(SU, SDep::Data, UseReg);
}
if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
@@ -300,15 +307,16 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
// TODO: Using a latency of 1 here for output dependencies assumes
// there's no cost for reusing registers.
SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
- for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
- for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ for (Reg2SUnitsMap::iterator I = Defs.find(Unit); I != Defs.end(); ++I) {
SUnit *DefSU = I->SU;
if (DefSU == &ExitSU)
continue;
MachineInstr *DefInstr = DefSU->getInstr();
- if (DefSU != SU && (Kind != SDep::Output || !MO.isDead() ||
- !DefInstr->registerDefIsDead(*Alias))) {
- SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ MachineOperand &DefMO = DefInstr->getOperand(I->OpIdx);
+ if (DefSU != SU &&
+ (Kind != SDep::Output || !MO.isDead() || !DefMO.isDead())) {
+ SDep Dep(SU, Kind, DefMO.getReg());
if (Kind != SDep::Anti) {
Dep.setLatency(
SchedModel.computeOutputLatency(MI, OperIdx, DefInstr));
@@ -324,37 +332,42 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
// retrieve the existing SUnits list for this register's uses.
// Push this SUnit on the use list.
- Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ Uses.insert(PhysRegSUOper(SU, OperIdx, Unit));
if (RemoveKillFlags)
MO.setIsKill(false);
} else {
addPhysRegDataDeps(SU, OperIdx);
// Clear previous uses and defs of this register and its subregisters.
- for (MCPhysReg SubReg : TRI->subregs_inclusive(Reg)) {
- Uses.eraseAll(SubReg);
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ Uses.eraseAll(Unit);
if (!MO.isDead())
- Defs.eraseAll(SubReg);
+ Defs.eraseAll(Unit);
}
+
if (MO.isDead() && SU->isCall) {
// Calls will not be reordered because of chain dependencies (see
// below). Since call operands are dead, calls may continue to be added
// to the DefList making dependence checking quadratic in the size of
// the block. Instead, we leave only one call at the back of the
// DefList.
- Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
- Reg2SUnitsMap::iterator B = P.first;
- Reg2SUnitsMap::iterator I = P.second;
- for (bool isBegin = I == B; !isBegin; /* empty */) {
- isBegin = (--I) == B;
- if (!I->SU->isCall)
- break;
- I = Defs.erase(I);
+ for (MCRegUnit Unit : TRI->regunits(Reg)) {
+ Reg2SUnitsMap::RangePair P = Defs.equal_range(Unit);
+ Reg2SUnitsMap::iterator B = P.first;
+ Reg2SUnitsMap::iterator I = P.second;
+ for (bool isBegin = I == B; !isBegin; /* empty */) {
+ isBegin = (--I) == B;
+ if (!I->SU->isCall)
+ break;
+ I = Defs.erase(I);
+ }
}
}
// Defs are pushed in the order they are visited and never reordered.
- Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ Defs.insert(PhysRegSUOper(SU, OperIdx, Unit));
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 165eeb0704ea89..667c561ea26f6f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1363,11 +1363,11 @@ define amdgpu_kernel void @v_copysign_out_f16_mag_f16_sign_f64(ptr addrspace(1)
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
-; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: flat_load_ushort v3, v[0:1]
-; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
+; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v2, s0, v3, v2
; VI-NEXT: flat_store_short v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 8a7cdf36accb7a..f957e0368c426a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -3788,13 +3788,13 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, 0xffff, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xffff, v2
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
+; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dword v32, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v33, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v34, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v35, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
-; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, 0xffff, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, 0xffff, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v29
diff --git a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
index 395a4f827ad6e7..a6ff60af1604c1 100644
--- a/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
+++ b/llvm/test/CodeGen/AMDGPU/schedule-physregdeps.mir
@@ -4,15 +4,11 @@
# CHECK: SU(0): $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec
# CHECK: Successors:
# CHECK-NEXT: SU(2): Out Latency=1
-# CHECK-NEXT: SU(4): Out Latency=1
# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0
-# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
# CHECK: SU(1): $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec
# CHECK: Successors:
# CHECK-NEXT: SU(3): Out Latency=1
-# CHECK-NEXT: SU(4): Out Latency=1
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr1
-# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
# CHECK: SU(2): $vgpr0 = V_ADD_CO_U32_e32 $sgpr2, $vgpr0, implicit-def $vcc, implicit $exec
# CHECK: Predecessors:
# CHECK-NEXT: SU(0): Out Latency=1
@@ -22,7 +18,6 @@
# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
# CHECK-NEXT: SU(3): Out Latency=1
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vcc
-# CHECK-NEXT: SU(4): Anti Latency=0
# CHECK: SU(3): $vgpr1 = V_ADDC_U32_e32 0, $vgpr1, implicit-def dead $vcc, implicit $vcc, implicit $exec
# CHECK: Predecessors:
# CHECK-NEXT: SU(2): Out Latency=1
@@ -32,19 +27,12 @@
# CHECK: Successors:
# CHECK-NEXT: SU(4): Out Latency=1
# CHECK-NEXT: SU(4): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(4): Anti Latency=0
# CHECK: SU(4): $vgpr0_vgpr1 = FLAT_LOAD_DWORDX2 renamable $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
# CHECK: Predecessors:
# CHECK-NEXT: SU(3): Out Latency=1
# CHECK-NEXT: SU(3): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(3): Anti Latency=0
# CHECK-NEXT: SU(2): Out Latency=1
# CHECK-NEXT: SU(2): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(2): Anti Latency=0
-# CHECK-NEXT: SU(1): Out Latency=1
-# CHECK-NEXT: SU(1): Data Latency=1 Reg=$vgpr0_vgpr1
-# CHECK-NEXT: SU(0): Out Latency=1
-# CHECK-NEXT: SU(0): Data Latency=1 Reg=$vgpr0_vgpr1
# CHECK: Successors:
# CHECK-NEXT: ExitSU: Ord Latency=3 Artificial
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
index f927bdd5390f69..3cbf3d21dec5a1 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting-explicit-regs.ll
@@ -35,8 +35,8 @@ define void @__int128_and_f(ptr noalias nocapture writeonly sret(i128) align 8 %
; Z15-LABEL: __int128_and_f:
; Z15: # %bb.0: # %entry
; Z15-NEXT: vl %v0, 0(%r3), 3
-; Z15-NEXT: vrepg %v6, %v0, 1
; Z15-NEXT: vlr %v4, %v0
+; Z15-NEXT: vrepg %v6, %v0, 1
; Z15-NEXT: #APP
; Z15-NEXT: #NO_APP
; Z15-NEXT: vmrhg %v0, %v4, %v6
@@ -260,8 +260,8 @@ entry:
define <4 x i32> @vec128_and_f(<4 x i32> %cc_dep1) {
; CHECK-LABEL: vec128_and_f:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vrepg %v3, %v24, 1
; CHECK-NEXT: vlr %v1, %v24
+; CHECK-NEXT: vrepg %v3, %v24, 1
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmrhg %v24, %v1, %v3
diff --git a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
index b953fc31f3fcef..23d78a9315b404 100644
--- a/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-fp-int-casting.ll
@@ -249,8 +249,8 @@ entry:
define <4 x i32> @vec128_and_f(<4 x i32> %cc_dep1) {
; CHECK-LABEL: vec128_and_f:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vrepg %v2, %v24, 1
; CHECK-NEXT: vlr %v0, %v24
+; CHECK-NEXT: vrepg %v2, %v24, 1
; CHECK-NEXT: #APP
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmrhg %v24, %v0, %v2
diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
index 2e75ce90eb48c5..219541cffb940f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll
@@ -47,8 +47,8 @@ define void @vldst4(ptr nocapture readonly %pIn, ptr nocapture %pOut, i32 %numRo
; CHECK-NEXT: vmovx.f16 s8, s13
; CHECK-NEXT: vins.f16 s20, s22
; CHECK-NEXT: vins.f16 s16, s18
-; CHECK-NEXT: vins.f16 s25, s6
; CHECK-NEXT: vmov.f32 s2, s5
+; CHECK-NEXT: vins.f16 s25, s6
; CHECK-NEXT: vmov.f32 s3, s17
; CHECK-NEXT: vins.f16 s0, s15
; CHECK-NEXT: vmovx.f16 s9, s21
diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
index 3f92152e189559..7d662d1f1a990f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -1358,8 +1358,8 @@ define void @vst3_v16f16(ptr %src, ptr %dst) {
; CHECK-NEXT: vldrw.u32 q3, [r0]
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov.f32 s16, s12
-; CHECK-NEXT: vins.f16 s16, s24
; CHECK-NEXT: vmov.f32 s0, s13
+; CHECK-NEXT: vins.f16 s16, s24
; CHECK-NEXT: vmov.16 q4[4], r2
; CHECK-NEXT: vins.f16 s0, s25
; CHECK-NEXT: vmov.f32 s19, s0