[llvm] [VPlan] Merge fcmp uno feeding Or. (PR #167251)

Sun Nov 9 13:24:17 PST 2025

llvmbot wrote:



@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Florian Hahn (fhahn)

<details>
<summary>Changes</summary>

Fold
 or (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
 or (fcmp uno %A, %B), ...

This pattern is generated to check if any vector lane is NaN, and combining multiple compares is beneficial on architectures that have dedicated instructions.

Alive2 Proof: https://alive2.llvm.org/ce/z/vA_aoM

This combine was suggested as part of #161735.

---
Full diff: https://github.com/llvm/llvm-project/pull/167251.diff


4 Files Affected:

- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+11) 
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll (+4-8) 
- (modified) llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll (+4-8) 
- (modified) llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll (+2-4) 


``````````diff

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 634df51a12965..74d4f6528d627 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1221,6 +1221,17 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     }
   }
 
+  // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
+  // This is useful for fmax/fmin without fast-math flags, where we need to
+  // check if any operand is NaN.
+  if (match(Def, m_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
+                                          m_Deferred(X)),
+                            m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y),
+                                          m_Deferred(Y))))) {
+    VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
+    return Def->replaceAllUsesWith(NewCmp);
+  }
+
   // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
   if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
        match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 7e58d9d6a8ec9..f3d649b899686 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -142,14 +142,10 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) {
 ; CHECK-NEXT:    [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]])
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP14]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = freeze <4 x i1> [[TMP12]]
-; CHECK-NEXT:    [[TMP17:%.*]] = freeze <4 x i1> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index 01fab87209a35..ca6e5bc2d0dcb 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -141,14 +141,10 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) {
 ; CHECK-NEXT:    [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]])
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP14]]
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP15]]
-; CHECK-NEXT:    [[TMP16:%.*]] = freeze <4 x i1> [[TMP12]]
-; CHECK-NEXT:    [[TMP17:%.*]] = freeze <4 x i1> [[TMP13]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]]
+; CHECK-NEXT:    [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
index e028bec138faf..a4f7631435bb3 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -697,10 +697,8 @@ define float @test_fmax_and_fmax(ptr %src.0, ptr %src.1, i64 %n) {
 ; CHECK-NEXT:    [[TMP2]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP6]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP8:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]])
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP11]]

``````````

</details>


https://github.com/llvm/llvm-project/pull/167251