[llvm] [InstCombinePHI] Enhance PHI CSE to remove redundant phis (PR #163453)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 14 15:11:39 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Congzhe (CongzheUalberta)
<details>
<summary>Changes</summary>
Enhanced PHI CSE to eliminate redundant PHIs, which could clean up the IR and open up opportunities for other passes such as loop vectorization.
### Motivation:
Given the following range() function,
```
void range(float *q, float *c, float &vl, float &vr)
{
vl = +1e20;
vr = -1e20;
for (int i = 0; i < 128; i++) {
float tmp = (*q) - (*c);
if (tmp < vl)
vl = tmp;
if (tmp > vr)
vr = tmp;
q++;
c++;
}
return;
}
```
The IR that is right before loop vectorization is shown below. Here `range()` is inlined into its caller and becomes the single BB `for.body`.
```
for.body: ; preds = %entry, %for.body
%v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
%v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
%phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] (<= redundant, needs clean-up)
%i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
%q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
%c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
%q.load = load float, ptr %q
%c.load = load float, ptr %c
%sub = fsub float %q.load, %c.load
%cmp1 = fcmp olt float %sub, %v0
%v0.1 = select i1 %cmp1, float %sub, float %v0
%same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove (<= redundant, needs clean-up)
%cmp2 = fcmp ogt float %sub, %same.as.v1
%v1.1 = select i1 %cmp2, float %sub, float %v1
%phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 (<= redundant, needs clean-up)
%inc.i = add nuw nsw i32 %i, 1
%q.next = getelementptr inbounds float, ptr %q, i64 1
%c.next = getelementptr inbounds float, ptr %c, i64 1
%exitcond = icmp eq i32 %inc.i, %count
br i1 %exitcond, label %exit, label %for.body
```
llvm trunk is not able to vectorize it because there is a redundant phi (`%phi.to.remove`) and redundant select instructions (`%phi.to.remove.next`, `%same.as.v1`).
Those instructions act exactly the same as `%v1`; hence they are purely redundant and should be eliminated.
This patch identifies the redundant phi and eliminates it, as a result the loop could get vectorized and performance could get improved.
How the redundant phi was generated:
It was initially introduced by GVN that did load-in-loop-pre, which partially eliminated the load of `%v1` and introduced in one of its predecessors this load `%.pre = load float, ptr %v1`. `%.pre` eventually became the redundant phi that was not cleaned up.
Compiler explorer: https://godbolt.org/z/f4ncn3Kjo
Please refer to the IR before vectorization in main(), and the IR before and after GVN in range().
---
Full diff: https://github.com/llvm/llvm-project/pull/163453.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp (+84-5)
- (added) llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll (+61)
``````````diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 9815644f5f43d..e736e89a3a146 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -1621,11 +1621,90 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
// Note that even though we've just canonicalized this PHI, due to the
// worklist visitation order, there are no guarantess that *every* PHI
// has been canonicalized, so we can't just compare operands ranges.
- if (!PN.isIdenticalToWhenDefined(&IdenticalPN))
- continue;
- // Just use that PHI instead then.
- ++NumPHICSEs;
- return replaceInstUsesWith(PN, &IdenticalPN);
+ if (PN.isIdenticalToWhenDefined(&IdenticalPN)) {
+ // Just use that PHI instead then.
+ ++NumPHICSEs;
+ return replaceInstUsesWith(PN, &IdenticalPN);
+ }
+
+ // Look for the following pattern and do PHI CSE to clean up the
+ // redundant %phi. Here %phi, %1 and %phi.next perform the same
+ // functionality as %identicalPhi and hence %phi can be eliminated.
+ //
+ // BB1:
+ // %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ]
+ // %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ]
+ // ...
+ // %identicalPhi.next = select %cmp, %val, %identicalPhi
+ // %1 = select %cmp2, %identicalPhi, float %phi
+ // %phi.next = select %cmp, %val, %1
+ //
+ // Prove that %phi and %identicalPhi are the same by induction:
+ //
+ // Base case: Both %phi and %identicalPhi are equal on entry to the loop.
+ // Inductive case:
+ // Suppose %phi and %identicalPhi are equal at iteration i.
+ // We look at their values at iteration i+1 which are %phi.next and
+ // %identicalPhi.next. They would have become different only when %cmp is
+ // false and the corresponding values %1 and %identicalPhi differ.
+ //
+ // The only condition when %1 and %identicalPhi could differ is when %cmp2
+ // is false and %1 is %phi, which contradicts our inductive hypothesis
+ // that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are
+ // always equal at iteration i+1.
+
+ if (PN.getNumIncomingValues() == 2 && PN.getNumUses() == 1) {
+ unsigned diffVals = 0;
+ unsigned diffValIdx = 0;
+ // Check that only the backedge incoming value is different.
+ for (unsigned i = 0; i < 2; i++) {
+ if (PN.getIncomingValue(i) != IdenticalPN.getIncomingValue(i)) {
+ diffVals++;
+ diffValIdx = i;
+ }
+ }
+ BasicBlock *CurBB = PN.getParent();
+ if (diffVals == 2 || PN.getIncomingBlock(diffValIdx) != CurBB)
+ continue;
+ // Now check that the backedge incoming values are two select
+ // instructions that are in the same BB, and have the same condition,
+ // true value.
+ auto *Val = PN.getIncomingValue(diffValIdx);
+ auto *IdenticalVal = IdenticalPN.getIncomingValue(diffValIdx);
+ if (!isa<SelectInst>(Val) || !isa<SelectInst>(IdenticalVal))
+ continue;
+
+ auto *SI = cast<SelectInst>(Val);
+ auto *IdenticalSI = cast<SelectInst>(IdenticalVal);
+ if (SI->getParent() != CurBB || IdenticalSI->getParent() != CurBB)
+ continue;
+ if (SI->getCondition() != IdenticalSI->getCondition() ||
+ SI->getTrueValue() != IdenticalSI->getTrueValue())
+ continue;
+
+ // Now check that the false values, i.e., %1 and %identicalPhi,
+ // are essentially the same value within the same BB.
+ auto SameSelAndPhi = [&](SelectInst *SI, PHINode *IdenticalPN,
+ PHINode *PN) {
+ if (SI->getTrueValue() == IdenticalPN) {
+ return SI->getFalseValue() == PN;
+ }
+ return false;
+ };
+ auto *FalseVal = SI->getFalseValue();
+ auto *IdenticalSIFalseVal =
+ dyn_cast<PHINode>(IdenticalSI->getFalseValue());
+ if (!isa<SelectInst>(FalseVal) || !IdenticalSIFalseVal ||
+ IdenticalSIFalseVal != &IdenticalPN)
+ continue;
+ auto *FalseValSI = cast<SelectInst>(FalseVal);
+ if (FalseValSI->getParent() != CurBB ||
+ !SameSelAndPhi(FalseValSI, &IdenticalPN, &PN))
+ continue;
+
+ ++NumPHICSEs;
+ return replaceInstUsesWith(PN, &IdenticalPN);
+ }
}
// If this is an integer PHI and we know that it has an illegal type, see if
diff --git a/llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll b/llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll
new file mode 100644
index 0000000000000..ae589b7450465
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=instcombine -instcombine-enhanced-phi-cse=true | FileCheck %s
+ at A = extern_weak global float, align 4
+
+; %phi.to.remove acts the same as %v1, and can be eliminated with PHI CSE.
+define void @enhanced_phi_cse(ptr %m, ptr %n, i32 %count) {
+; CHECK-LABEL: @enhanced_phi_cse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
+; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
+; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
+; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
+; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
+; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds float, ptr [[Q]], i64 1
+; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds float, ptr [[C]], i64 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: exit:
+; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
+ %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
+ %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
+ %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
+ %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
+ %q.load = load float, ptr %q
+ %c.load = load float, ptr %c
+ %sub = fsub float %q.load, %c.load
+ %cmp1 = fcmp olt float %sub, %v0
+ %v0.1 = select i1 %cmp1, float %sub, float %v0
+ %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
+ %cmp2 = fcmp ogt float %sub, %same.as.v1
+ %v1.1 = select i1 %cmp2, float %sub, float %v1
+ %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
+ %inc.i = add nuw nsw i32 %i, 1
+ %q.next = getelementptr inbounds float, ptr %q, i64 1
+ %c.next = getelementptr inbounds float, ptr %c, i64 1
+ %exitcond = icmp eq i32 %inc.i, %count
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ %vl.1.lcssa = phi float [ %v1.1, %for.body ]
+ store float %vl.1.lcssa, ptr @A
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/163453
More information about the llvm-commits
mailing list