[llvm] 8181dcd - [AMDGPU] WQM/WWM: Fix marking of partial definitions
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Fri Feb 19 03:47:15 PST 2021
Author: Carl Ritson
Date: 2021-02-19T20:45:24+09:00
New Revision: 8181dcd30feeb7258685cafc35e0b01c894c6f3b
URL: https://github.com/llvm/llvm-project/commit/8181dcd30feeb7258685cafc35e0b01c894c6f3b
DIFF: https://github.com/llvm/llvm-project/commit/8181dcd30feeb7258685cafc35e0b01c894c6f3b.diff
LOG: [AMDGPU] WQM/WWM: Fix marking of partial definitions
Track lanes when processing definitions for marking WQM/WWM.
If all lanes have been defined then marking can stop.
This prevents marking unnecessary instructions as WQM/WWM.
In particular this fixes a bug where values passing through
V_SET_INACTIVE would be marked as requiring WWM.
Reviewed By: piotr
Differential Revision: https://reviews.llvm.org/D95503
Added:
Modified:
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/test/CodeGen/AMDGPU/wqm.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 7c62282012f9..b5614e05b4d2 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -165,6 +165,8 @@ class SIWholeQuadMode : public MachineFunctionPass {
std::vector<WorkItem> &Worklist);
void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
+ void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
+ std::vector<WorkItem> &Worklist);
void markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
@@ -272,8 +274,6 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
assert(!(Flag & StateExact) && Flag != 0);
- LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
-
// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where
@@ -285,6 +285,7 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
if ((II.Needs & Flag) == Flag)
return;
+ LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
II.Needs |= Flag;
Worklist.push_back(&MI);
}
@@ -299,6 +300,16 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
if (!UseLRQ.valueIn())
return;
+ // Note: this code assumes that lane masks on AMDGPU completely
+ // cover registers.
+ LaneBitmask DefinedLanes;
+ LaneBitmask UseLanes;
+ if (SubReg) {
+ UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
+ } else if (Reg.isVirtual()) {
+ UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
+ }
+
SmallPtrSet<const VNInfo *, 4> Visited;
SmallVector<const VNInfo *, 4> ToProcess;
ToProcess.push_back(UseLRQ.valueIn());
@@ -321,64 +332,93 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
} else {
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
- markInstruction(*MI, Flag, Worklist);
- // Iterate over all operands to find relevant definitions
- for (const MachineOperand &Op : MI->operands()) {
- if (!(Op.isReg() && Op.getReg() == Reg))
- continue;
-
- // Does this def cover whole register?
- bool DefinesFullReg =
- Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
- if (!DefinesFullReg) {
- // Partial definition; need to follow and mark input value
- LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
- if (const VNInfo *VN = LRQ.valueIn()) {
- if (!Visited.count(VN))
- ToProcess.push_back(VN);
+ if (Reg.isVirtual()) {
+ // Iterate over all operands to find relevant definitions
+ bool HasDef = false;
+ for (const MachineOperand &Op : MI->operands()) {
+ if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
+ continue;
+
+ // Compute lanes defined and overlap with use
+ LaneBitmask OpLanes =
+ Op.isUndef() ? LaneBitmask::getAll()
+ : TRI->getSubRegIndexLaneMask(Op.getSubReg());
+ LaneBitmask Overlap = (UseLanes & OpLanes);
+
+ // Record if this instruction defined any of use
+ HasDef |= Overlap.any();
+
+ // Check if all lanes of use have been defined
+ DefinedLanes |= OpLanes;
+ if ((DefinedLanes & UseLanes) != UseLanes) {
+ // Definition not complete; need to process input value
+ LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
+ if (const VNInfo *VN = LRQ.valueIn()) {
+ if (!Visited.count(VN))
+ ToProcess.push_back(VN);
+ }
}
}
+ // Only mark the instruction if it defines some part of the use
+ if (HasDef)
+ markInstruction(*MI, Flag, Worklist);
+ } else {
+ // For physical registers simply mark the defining instruction
+ markInstruction(*MI, Flag, Worklist);
}
}
} while (!ToProcess.empty());
-}
-/// Mark all instructions defining the uses in \p MI with \p Flag.
-void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
- std::vector<WorkItem> &Worklist) {
-
- LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
- << MI);
+ assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));
+}
- for (const MachineOperand &Use : MI.uses()) {
- if (!Use.isReg() || !Use.isUse())
- continue;
+void SIWholeQuadMode::markOperand(const MachineInstr &MI,
+ const MachineOperand &Op, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ assert(Op.isReg());
+ Register Reg = Op.getReg();
- Register Reg = Use.getReg();
+ // Ignore some hardware registers
+ switch (Reg) {
+ case AMDGPU::EXEC:
+ case AMDGPU::EXEC_LO:
+ return;
+ default:
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
+ << " for " << MI);
+ if (Reg.isVirtual()) {
+ LiveRange &LR = LIS->getInterval(Reg);
+ markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
+ } else {
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- if (!Reg.isVirtual()) {
- if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
+ for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
+ ++RegUnit) {
+ LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
+ if (!Value)
continue;
- for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
- ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
- const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
- if (!Value)
- continue;
+ markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
+ }
+ }
+}
- markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
- }
+/// Mark all instructions defining the uses in \p MI with \p Flag.
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
+ std::vector<WorkItem> &Worklist) {
+ LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
+ << MI);
+ for (const MachineOperand &Use : MI.uses()) {
+ if (!Use.isReg() || !Use.isUse())
continue;
- }
-
- LiveRange &LR = LIS->getInterval(Reg);
- markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
+ markOperand(MI, Use, Flag, Worklist);
}
}
@@ -441,11 +481,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (Inactive.isUndef()) {
LowerToCopyInstrs.push_back(&MI);
} else {
- Register Reg = Inactive.getReg();
- if (Reg.isVirtual()) {
- for (MachineInstr &DefMI : MRI->def_instructions(Reg))
- markInstruction(DefMI, StateWWM, Worklist);
- }
+ markOperand(MI, Inactive, StateWWM, Worklist);
}
}
SetInactiveInstrs.push_back(&MI);
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index effac7e9290c..dddc569935d9 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -233,3 +233,29 @@ body: |
$vgpr3 = COPY %9.sub1:vreg_128
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3
...
+
+---
+# Check that unnecessary instructions do not get marked for WWM
+#
+#CHECK-NOT: ENTER_WWM
+#CHECK: BUFFER_LOAD_DWORDX2
+#CHECK-NOT: ENTER_WWM
+#CHECK: V_SET_INACTIVE_B32
+#CHECK: V_SET_INACTIVE_B32
+#CHECK: ENTER_WWM
+#CHECK: V_MAX
+name: test_wwm_set_inactive_propagation
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0
+ %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ %1:vgpr_32 = COPY $vgpr0
+ %2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+ %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc
+ %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc
+ %3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
+ $vgpr0 = WWM %3.sub0:vreg_64, implicit $exec
+ $vgpr1 = WWM %3.sub1:vreg_64, implicit $exec
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
+...
More information about the llvm-commits
mailing list