[llvm] [AMDGPU] Reland: Do not use original PHIs in coercion chains (PR #98985)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 15 22:36:14 PDT 2024
https://github.com/jrbyrnes updated https://github.com/llvm/llvm-project/pull/98985
>From 57eb97caa3dabc824e630296beb8fd8b6c6de2dd Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Mon, 15 Jul 2024 19:46:07 -0700
Subject: [PATCH 1/2] [AMDGPU] Reland: Do not use original PHIs in coercion
chains
Change-Id: I579b5c69a85997f168ed35354b326524b6f84ef7
---
.../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 46 ++++++++---
llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll | 76 +++++++++++++++++++
2 files changed, 113 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 2cc95f81d2f94..eade79dd3bfeb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -365,13 +365,33 @@ bool LiveRegOptimizer::optimizeLiveType(
else
MissingIncVal = true;
}
- Instruction *DeadInst = Phi;
if (MissingIncVal) {
- DeadInst = cast<Instruction>(ValMap[Phi]);
- // Do not use the dead phi
- ValMap[Phi] = Phi;
+ Value *DeadVal = ValMap[Phi];
+ // The coercion chain of the PHI is broken. Delete the Phi
+ // from the ValMap and any connected / user Phis.
+ SmallVector<Value *, 4> PHIWorklist;
+ SmallPtrSet<Value *, 4> Visited;
+ PHIWorklist.push_back(DeadVal);
+ while (!PHIWorklist.empty()) {
+ Value *NextDeadValue = PHIWorklist.pop_back_val();
+ Visited.insert(NextDeadValue);
+ auto OriginalPhi =
+ std::find_if(PhiNodes.begin(), PhiNodes.end(),
+ [this, &NextDeadValue](PHINode *CandPhi) {
+ return ValMap[CandPhi] == NextDeadValue;
+ });
+ assert(OriginalPhi != PhiNodes.end());
+ ValMap.erase(*OriginalPhi);
+ DeadInsts.emplace_back(cast<Instruction>(NextDeadValue));
+
+ for (User *U : NextDeadValue->users()) {
+ if (!Visited.contains(cast<PHINode>(U)))
+ PHIWorklist.push_back(U);
+ }
+ }
+ } else {
+ DeadInsts.emplace_back(cast<Instruction>(Phi));
}
- DeadInsts.emplace_back(DeadInst);
}
// Coerce back to the original type and replace the uses.
for (Instruction *U : Uses) {
@@ -384,10 +404,18 @@ bool LiveRegOptimizer::optimizeLiveType(
NewVal = BBUseValMap[U->getParent()][ValMap[Op]];
else {
BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
- NewVal =
- convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
- InsertPt, U->getParent());
- BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
+ // We may pick up ops that were previously converted for users in
+ // other blocks. If there is a originally typed definition of the Op
+ // already in this block, simply reuse it.
+ if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
+ U->getParent() == cast<Instruction>(Op)->getParent()) {
+ NewVal = Op;
+ } else {
+ NewVal =
+ convertFromOptType(Op->getType(), cast<Instruction>(ValMap[Op]),
+ InsertPt, U->getParent());
+ BBUseValMap[U->getParent()][ValMap[Op]] = NewVal;
+ }
}
assert(NewVal);
U->setOperand(OpIdx, NewVal);
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
index 5d2e299aa854a..8d27e09562651 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll
@@ -349,4 +349,80 @@ bb.2:
ret void
}
+; Should not produce a broken phi
+
+define void @broken_phi() {
+; GFX906-LABEL: define void @broken_phi(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT: bb:
+; GFX906-NEXT: br label [[BB1:%.*]]
+; GFX906: bb1:
+; GFX906-NEXT: [[I:%.*]] = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, [[BB:%.*]] ], [ [[I8:%.*]], [[BB7:%.*]] ]
+; GFX906-NEXT: br i1 false, label [[BB3:%.*]], label [[BB2:%.*]]
+; GFX906: bb2:
+; GFX906-NEXT: br label [[BB3]]
+; GFX906: bb3:
+; GFX906-NEXT: [[I4:%.*]] = phi <4 x i8> [ zeroinitializer, [[BB2]] ], [ [[I]], [[BB1]] ]
+; GFX906-NEXT: br i1 false, label [[BB7]], label [[BB5:%.*]]
+; GFX906: bb5:
+; GFX906-NEXT: [[I6:%.*]] = call <4 x i8> @llvm.smax.v4i8(<4 x i8> [[I4]], <4 x i8> zeroinitializer)
+; GFX906-NEXT: br label [[BB7]]
+; GFX906: bb7:
+; GFX906-NEXT: [[I8]] = phi <4 x i8> [ zeroinitializer, [[BB5]] ], [ zeroinitializer, [[BB3]] ]
+; GFX906-NEXT: br label [[BB1]]
+;
+bb:
+ br label %bb1
+bb1:
+ %i = phi <4 x i8> [ <i8 1, i8 1, i8 1, i8 1>, %bb ], [ %i8, %bb7 ]
+ br i1 false, label %bb3, label %bb2
+bb2:
+ br label %bb3
+bb3:
+ %i4 = phi <4 x i8> [ zeroinitializer, %bb2 ], [ %i, %bb1 ]
+ br i1 false, label %bb7, label %bb5
+bb5:
+ %i6 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> %i4, <4 x i8> zeroinitializer)
+ br label %bb7
+bb7:
+ %i8 = phi <4 x i8> [ zeroinitializer, %bb5 ], [ zeroinitializer, %bb3 ]
+ br label %bb1
+}
+
+; %sel1 should just use %sel0 instead of trying to convert back the
+; converted version of %sel0
+
+define amdgpu_kernel void @reuseOp() {
+; GFX906-LABEL: define amdgpu_kernel void @reuseOp(
+; GFX906-SAME: ) #[[ATTR0]] {
+; GFX906-NEXT: entry:
+; GFX906-NEXT: [[VEC1:%.*]] = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
+; GFX906-NEXT: [[VEC1_BC:%.*]] = bitcast <16 x i8> [[VEC1]] to <4 x i32>
+; GFX906-NEXT: br label [[BB_1:%.*]]
+; GFX906: bb.1:
+; GFX906-NEXT: [[VEC1_BC_BC:%.*]] = bitcast <4 x i32> [[VEC1_BC]] to <16 x i8>
+; GFX906-NEXT: [[SEL0:%.*]] = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
+; GFX906-NEXT: [[SEL0_BC:%.*]] = bitcast <16 x i8> [[SEL0]] to <4 x i32>
+; GFX906-NEXT: [[SEL1:%.*]] = select i1 false, <16 x i8> [[VEC1_BC_BC]], <16 x i8> [[SEL0]]
+; GFX906-NEXT: br label [[BB_2:%.*]]
+; GFX906: bb.2:
+; GFX906-NEXT: [[SEL0_BC_BC:%.*]] = bitcast <4 x i32> [[SEL0_BC]] to <16 x i8>
+; GFX906-NEXT: [[VAL:%.*]] = extractelement <16 x i8> [[SEL0_BC_BC]], i64 0
+; GFX906-NEXT: ret void
+;
+entry:
+ %vec1 = insertelement <16 x i8> zeroinitializer, i8 0, i64 0
+ br label %bb.1
+
+bb.1:
+ %sel0 = select i1 false, <16 x i8> zeroinitializer, <16 x i8> zeroinitializer
+ %sel1 = select i1 false, <16 x i8> %vec1, <16 x i8> %sel0
+ br label %bb.2
+
+bb.2:
+ %val = extractelement <16 x i8> %sel0, i64 0
+ ret void
+}
+
+
declare i32 @llvm.amdgcn.workitem.id.x()
>From 0e5aa175a90519b1620c511a0fe5ff381e2338ab Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <jeffrey.byrnes at amd.com>
Date: Mon, 15 Jul 2024 22:36:07 -0700
Subject: [PATCH 2/2] Update
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index eade79dd3bfeb..054e1eca14006 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -405,7 +405,7 @@ bool LiveRegOptimizer::optimizeLiveType(
else {
BasicBlock::iterator InsertPt = U->getParent()->getFirstNonPHIIt();
// We may pick up ops that were previously converted for users in
- // other blocks. If there is a originally typed definition of the Op
+ // other blocks. If there is an originally typed definition of the Op
// already in this block, simply reuse it.
if (isa<Instruction>(Op) && !isa<PHINode>(Op) &&
U->getParent() == cast<Instruction>(Op)->getParent()) {
More information about the llvm-commits
mailing list