[llvm] 13877db - [AMDGPU] Fix shortfalls in WQM marking
Carl Ritson via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 15 05:45:40 PDT 2021
Author: Carl Ritson
Date: 2021-03-15T21:44:15+09:00
New Revision: 13877db2fa859c9753bd87a47ef00cf262516ae7
URL: https://github.com/llvm/llvm-project/commit/13877db2fa859c9753bd87a47ef00cf262516ae7
DIFF: https://github.com/llvm/llvm-project/commit/13877db2fa859c9753bd87a47ef00cf262516ae7.diff
LOG: [AMDGPU] Fix shortfalls in WQM marking
When tracking defined lanes through phi nodes in the live range
graph each branch of the phi must be handled independently.
Also rewrite the marking algorithm to reduce unnecessary
operations.
Previously a shared set of defined lanes was used which caused
marking to stop prematurely. This was observable in existing lit
tests, but test patterns did not cover this detail.
Reviewed By: piotr
Differential Revision: https://reviews.llvm.org/D98614
Added:
Modified:
llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
llvm/test/CodeGen/AMDGPU/wqm.ll
llvm/test/CodeGen/AMDGPU/wqm.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index f2047f2902fd..82b0ed063274 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -318,38 +318,63 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
- if (!UseLRQ.valueIn())
+ const VNInfo *Value = UseLRQ.valueIn();
+ if (!Value)
return;
// Note: this code assumes that lane masks on AMDGPU completely
// cover registers.
+ const LaneBitmask UseLanes =
+ SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+ : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
+ : LaneBitmask::getNone());
+
+ // Perform a depth-first iteration of the LiveRange graph marking defs.
+ // Stop processing of a given branch when all use lanes have been defined.
+ // The first definition stops processing for a physical register.
+ struct PhiEntry {
+ const VNInfo *Phi;
+ unsigned PredIdx;
+ unsigned VisitIdx;
+ LaneBitmask DefinedLanes;
+
+ PhiEntry(const VNInfo *Phi, unsigned PredIdx, unsigned VisitIdx,
+ LaneBitmask DefinedLanes)
+ : Phi(Phi), PredIdx(PredIdx), VisitIdx(VisitIdx),
+ DefinedLanes(DefinedLanes) {}
+ };
+ SmallSetVector<const VNInfo *, 4> Visited;
+ SmallVector<PhiEntry, 2> PhiStack;
LaneBitmask DefinedLanes;
- LaneBitmask UseLanes;
- if (SubReg) {
- UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
- } else if (Reg.isVirtual()) {
- UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
- }
-
- SmallPtrSet<const VNInfo *, 4> Visited;
- SmallVector<const VNInfo *, 4> ToProcess;
- ToProcess.push_back(UseLRQ.valueIn());
+ unsigned NextPredIdx; // Only used for processing phi nodes
do {
- const VNInfo *Value = ToProcess.pop_back_val();
- Visited.insert(Value);
+ const VNInfo *NextValue = nullptr;
+
+ if (!Visited.count(Value)) {
+ Visited.insert(Value);
+ // On first visit to a phi then start processing first predecessor
+ NextPredIdx = 0;
+ }
if (Value->isPHIDef()) {
- // Need to mark all defs used in the PHI node
+ // Each predecessor node in the phi must be processed as a subgraph
const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
assert(MBB && "Phi-def has no defining MBB");
- for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
- PE = MBB->pred_end();
- PI != PE; ++PI) {
+
+ // Find next predecessor to process
+ unsigned Idx = NextPredIdx;
+ auto PI = MBB->pred_begin() + Idx;
+ auto PE = MBB->pred_end();
+ for (; PI != PE && !NextValue; ++PI, ++Idx) {
if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
if (!Visited.count(VN))
- ToProcess.push_back(VN);
+ NextValue = VN;
}
}
+
+ // If there are more predecessors to process; add phi to stack
+ if (PI != PE)
+ PhiStack.emplace_back(Value, Idx, Visited.size(), DefinedLanes);
} else {
MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
assert(MI && "Def has no defining instruction");
@@ -370,17 +395,20 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
// Record if this instruction defined any of use
HasDef |= Overlap.any();
- // Check if all lanes of use have been defined
+ // Mark any lanes defined
DefinedLanes |= OpLanes;
- if ((DefinedLanes & UseLanes) != UseLanes) {
- // Definition not complete; need to process input value
- LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
- if (const VNInfo *VN = LRQ.valueIn()) {
- if (!Visited.count(VN))
- ToProcess.push_back(VN);
- }
+ }
+
+ // Check if all lanes of use have been defined
+ if ((DefinedLanes & UseLanes) != UseLanes) {
+ // Definition not complete; need to process input value
+ LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
+ if (const VNInfo *VN = LRQ.valueIn()) {
+ if (!Visited.count(VN))
+ NextValue = VN;
}
}
+
// Only mark the instruction if it defines some part of the use
if (HasDef)
markInstruction(*MI, Flag, Worklist);
@@ -389,9 +417,21 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
markInstruction(*MI, Flag, Worklist);
}
}
- } while (!ToProcess.empty());
- assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));
+ if (!NextValue && !PhiStack.empty()) {
+ // Reach end of chain; revert to processing last phi
+ PhiEntry &Entry = PhiStack.back();
+ NextValue = Entry.Phi;
+ NextPredIdx = Entry.PredIdx;
+ DefinedLanes = Entry.DefinedLanes;
+ // Rewind visited set to correct state
+ while (Visited.size() > Entry.VisitIdx)
+ Visited.pop_back();
+ PhiStack.pop_back();
+ }
+
+ Value = NextValue;
+ } while (Value);
}
void SIWholeQuadMode::markOperand(const MachineInstr &MI,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index 20c44ab9d485..7ab0a0fc04f6 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -859,6 +859,10 @@ main_body:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_mov
+; CHECK: v_mov
+; CHECK: v_mov
+; CHECK: v_mov
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index 2f95ae1fe2b7..0c339ccfcf6a 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -259,3 +259,42 @@ body: |
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
...
+
+---
+# Check that WQM marking occurs correctly through phi nodes in live range graph.
+# If not then initial V_MOV will not be in WQM.
+#
+#CHECK-LABEL: name: test_wqm_lr_phi
+#CHECK: COPY $exec
+#CHECK-NEXT: S_WQM
+#CHECK-NEXT: V_MOV_B32_e32 -10
+#CHECK-NEXT: V_MOV_B32_e32 0
+name: test_wqm_lr_phi
+tracksRegLiveness: true
+body: |
+ bb.0:
+ undef %0.sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
+ %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ %1:sreg_64 = S_GETPC_B64
+ %2:sgpr_256 = S_LOAD_DWORDX8_IMM %1:sreg_64, 32, 0, 0
+
+ bb.1:
+ $vcc = V_CMP_LT_U32_e64 4, 4, implicit $exec
+ S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ %0.sub0:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.3:
+ %0.sub1:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ %3:sgpr_128 = IMPLICIT_DEF
+ %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %2:sgpr_256, %3:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource")
+ $vgpr0 = COPY %4.sub0:vreg_128
+ $vgpr1 = COPY %4.sub1:vreg_128
+ SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
+...
More information about the llvm-commits
mailing list