[llvm] [InstCombinePHI] Enhance PHI CSE to remove redundant phis (PR #163453)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 14 15:11:39 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Congzhe (CongzheUalberta)
<details>
<summary>Changes</summary>
Enhanced PHI CSE to eliminate redundant PHIs, which could clean up the IR and open up opportunities for other passes such as loop vectorization.
### Motivation:
Given the following range() function,
```
void range(float *q, float *c, float &vl, float &vr)
{
vl = +1e20;
vr = -1e20;
for (int i = 0; i < 128; i++) {
float tmp = (*q) - (*c);
if (tmp < vl)
vl = tmp;
if (tmp > vr)
vr = tmp;
q++;
c++;
}
return;
}
```
The IR that is right before loop vectorization is shown below. Here `range()` is inlined into its caller and becomes the single BB `for.body`.
```
for.body: ; preds = %entry, %for.body
%v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
%v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
%phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ] (<= redundant, needs clean-up)
%i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
%q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
%c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
%q.load = load float, ptr %q
%c.load = load float, ptr %c
%sub = fsub float %q.load, %c.load
%cmp1 = fcmp olt float %sub, %v0
%v0.1 = select i1 %cmp1, float %sub, float %v0
%same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove (<= redundant, needs clean-up)
%cmp2 = fcmp ogt float %sub, %same.as.v1
%v1.1 = select i1 %cmp2, float %sub, float %v1
%phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1 (<= redundant, needs clean-up)
%inc.i = add nuw nsw i32 %i, 1
%q.next = getelementptr inbounds float, ptr %q, i64 1
%c.next = getelementptr inbounds float, ptr %c, i64 1
%exitcond = icmp eq i32 %inc.i, %count
br i1 %exitcond, label %exit, label %for.body
```
llvm trunk is not able to vectorize it because there is a redundant phi (`%phi.to.remove`) and redundant select instructions (`%phi.to.remove.next`, `%same.as.v1`).
Those instructions act exactly the same as `%v1`; hence they are purely redundant and should be eliminated.
This patch identifies the redundant phi and eliminates it, as a result the loop could get vectorized and performance could get improved.
How the redundant phi was generated:
It was initially introduced by GVN that did load-in-loop-pre, which partially eliminated the load of `%v1` and introduced in one of its predecessors this load `%.pre = load float, ptr %v1`. `%.pre` eventually became the redundant phi that was not cleaned up.
Compiler explorer: https://godbolt.org/z/f4ncn3Kjo
Please refer to the IR before vectorization in main(), and the IR before and after GVN in range().
---
Full diff: https://github.com/llvm/llvm-project/pull/163453.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp (+84-5)
- (added) llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll (+61)
``````````diff
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 9815644f5f43d..e736e89a3a146 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -1621,11 +1621,90 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
// Note that even though we've just canonicalized this PHI, due to the
// worklist visitation order, there are no guarantess that *every* PHI
// has been canonicalized, so we can't just compare operands ranges.
- if (!PN.isIdenticalToWhenDefined(&IdenticalPN))
- continue;
- // Just use that PHI instead then.
- ++NumPHICSEs;
- return replaceInstUsesWith(PN, &IdenticalPN);
+ if (PN.isIdenticalToWhenDefined(&IdenticalPN)) {
+ // Just use that PHI instead then.
+ ++NumPHICSEs;
+ return replaceInstUsesWith(PN, &IdenticalPN);
+ }
+
+ // Look for the following pattern and do PHI CSE to clean up the
+ // redundant %phi. Here %phi, %1 and %phi.next perform the same
+ // functionality as %identicalPhi and hence %phi can be eliminated.
+ //
+ // BB1:
+ // %identicalPhi = phi [ X, %BB0 ], [ %identicalPhi.next, %BB1 ]
+ // %phi = phi [ X, %BB0 ], [ %phi.next, %BB1 ]
+ // ...
+ // %identicalPhi.next = select %cmp, %val, %identicalPhi
+ // %1 = select %cmp2, %identicalPhi, float %phi
+ // %phi.next = select %cmp, %val, %1
+ //
+ // Prove that %phi and %identicalPhi are the same by induction:
+ //
+ // Base case: Both %phi and %identicalPhi are equal on entry to the loop.
+ // Inductive case:
+ // Suppose %phi and %identicalPhi are equal at iteration i.
+ // We look at their values at iteration i+1 which are %phi.next and
+ // %identicalPhi.next. They would have become different only when %cmp is
+ // false and the corresponding values %1 and %identicalPhi differ.
+ //
+ // The only condition when %1 and %identicalPhi could differ is when %cmp2
+ // is false and %1 is %phi, which contradicts our inductive hypothesis
+ // that %phi and %identicalPhi are equal. Thus %phi and %identicalPhi are
+ // always equal at iteration i+1.
+
+ if (PN.getNumIncomingValues() == 2 && PN.getNumUses() == 1) {
+ unsigned diffVals = 0;
+ unsigned diffValIdx = 0;
+ // Check that only the backedge incoming value is different.
+ for (unsigned i = 0; i < 2; i++) {
+ if (PN.getIncomingValue(i) != IdenticalPN.getIncomingValue(i)) {
+ diffVals++;
+ diffValIdx = i;
+ }
+ }
+ BasicBlock *CurBB = PN.getParent();
+ if (diffVals == 2 || PN.getIncomingBlock(diffValIdx) != CurBB)
+ continue;
+ // Now check that the backedge incoming values are two select
+ // instructions that are in the same BB, and have the same condition,
+ // true value.
+ auto *Val = PN.getIncomingValue(diffValIdx);
+ auto *IdenticalVal = IdenticalPN.getIncomingValue(diffValIdx);
+ if (!isa<SelectInst>(Val) || !isa<SelectInst>(IdenticalVal))
+ continue;
+
+ auto *SI = cast<SelectInst>(Val);
+ auto *IdenticalSI = cast<SelectInst>(IdenticalVal);
+ if (SI->getParent() != CurBB || IdenticalSI->getParent() != CurBB)
+ continue;
+ if (SI->getCondition() != IdenticalSI->getCondition() ||
+ SI->getTrueValue() != IdenticalSI->getTrueValue())
+ continue;
+
+ // Now check that the false values, i.e., %1 and %identicalPhi,
+ // are essentially the same value within the same BB.
+ auto SameSelAndPhi = [&](SelectInst *SI, PHINode *IdenticalPN,
+ PHINode *PN) {
+ if (SI->getTrueValue() == IdenticalPN) {
+ return SI->getFalseValue() == PN;
+ }
+ return false;
+ };
+ auto *FalseVal = SI->getFalseValue();
+ auto *IdenticalSIFalseVal =
+ dyn_cast<PHINode>(IdenticalSI->getFalseValue());
+ if (!isa<SelectInst>(FalseVal) || !IdenticalSIFalseVal ||
+ IdenticalSIFalseVal != &IdenticalPN)
+ continue;
+ auto *FalseValSI = cast<SelectInst>(FalseVal);
+ if (FalseValSI->getParent() != CurBB ||
+ !SameSelAndPhi(FalseValSI, &IdenticalPN, &PN))
+ continue;
+
+ ++NumPHICSEs;
+ return replaceInstUsesWith(PN, &IdenticalPN);
+ }
}
// If this is an integer PHI and we know that it has an illegal type, see if
diff --git a/llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll b/llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll
new file mode 100644
index 0000000000000..ae589b7450465
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/enhanced-phi-cse.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -passes=instcombine -instcombine-enhanced-phi-cse=true | FileCheck %s
+ at A = extern_weak global float, align 4
+
+; %phi.to.remove acts the same as %v1, and can be eliminated with PHI CSE.
+define void @enhanced_phi_cse(ptr %m, ptr %n, i32 %count) {
+; CHECK-LABEL: @enhanced_phi_cse(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[V0:%.*]] = phi float [ 0x4415AF1D80000000, [[ENTRY:%.*]] ], [ [[V0_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[V1:%.*]] = phi float [ 0xC415AF1D80000000, [[ENTRY]] ], [ [[V1_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[Q:%.*]] = phi ptr [ [[M:%.*]], [[ENTRY]] ], [ [[Q_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[C:%.*]] = phi ptr [ [[N:%.*]], [[ENTRY]] ], [ [[C_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[Q_LOAD:%.*]] = load float, ptr [[Q]], align 4
+; CHECK-NEXT: [[C_LOAD:%.*]] = load float, ptr [[C]], align 4
+; CHECK-NEXT: [[SUB:%.*]] = fsub float [[Q_LOAD]], [[C_LOAD]]
+; CHECK-NEXT: [[CMP1:%.*]] = fcmp olt float [[SUB]], [[V0]]
+; CHECK-NEXT: [[V0_1]] = select i1 [[CMP1]], float [[SUB]], float [[V0]]
+; CHECK-NEXT: [[CMP2:%.*]] = fcmp ogt float [[SUB]], [[V1]]
+; CHECK-NEXT: [[V1_1]] = select i1 [[CMP2]], float [[SUB]], float [[V1]]
+; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: [[Q_NEXT]] = getelementptr inbounds float, ptr [[Q]], i64 1
+; CHECK-NEXT: [[C_NEXT]] = getelementptr inbounds float, ptr [[C]], i64 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC_I]], [[COUNT:%.*]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK: exit:
+; CHECK-NEXT: store float [[V1_1]], ptr @A, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %v0 = phi float [ 0x4415AF1D80000000, %entry ], [ %v0.1, %for.body ]
+ %v1 = phi float [ 0xC415AF1D80000000, %entry ], [ %v1.1, %for.body ]
+ %phi.to.remove = phi float [ 0xC415AF1D80000000, %entry ], [ %phi.to.remove.next, %for.body ]
+ %i = phi i32 [ 0, %entry ], [ %inc.i, %for.body ]
+ %q = phi ptr [ %m, %entry ], [ %q.next, %for.body ]
+ %c = phi ptr [ %n, %entry ], [ %c.next, %for.body ]
+ %q.load = load float, ptr %q
+ %c.load = load float, ptr %c
+ %sub = fsub float %q.load, %c.load
+ %cmp1 = fcmp olt float %sub, %v0
+ %v0.1 = select i1 %cmp1, float %sub, float %v0
+ %same.as.v1 = select i1 %cmp1, float %v1, float %phi.to.remove
+ %cmp2 = fcmp ogt float %sub, %same.as.v1
+ %v1.1 = select i1 %cmp2, float %sub, float %v1
+ %phi.to.remove.next = select i1 %cmp2, float %sub, float %same.as.v1
+ %inc.i = add nuw nsw i32 %i, 1
+ %q.next = getelementptr inbounds float, ptr %q, i64 1
+ %c.next = getelementptr inbounds float, ptr %c, i64 1
+ %exitcond = icmp eq i32 %inc.i, %count
+ br i1 %exitcond, label %exit, label %for.body
+
+exit:
+ %vl.1.lcssa = phi float [ %v1.1, %for.body ]
+ store float %vl.1.lcssa, ptr @A
+ ret void
+}
``````````
</details>
https://github.com/llvm/llvm-project/pull/163453
More information about the llvm-commits
mailing list