[llvm] [VPlan] Don't rely on region check in isUniformAfterVectorization. (PR #137883)

Wed Apr 30 13:49:33 PDT 2025

https://github.com/fhahn updated https://github.com/llvm/llvm-project/pull/137883

>From 1d63029152f03ec9bc225f395128c24268d11817 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 29 Apr 2025 21:49:46 +0100
Subject: [PATCH 1/2] [VPlan] Don't rely on region check in
 isUniformAfterVectorization.

Generalize isUniformAfterVectorization check to not rely on the region,
but purely work on checking operands and opcodes.

This will be needed when disolving the vector region
(https://github.com/llvm/llvm-project/pull/117506) and improves
codegen slightly in some cases.
---
 llvm/lib/Transforms/Vectorize/VPlanUtils.h    | 27 ++++++++++++++++---
 .../AArch64/conditional-branches-cost.ll      |  6 ++---
 .../uniform_across_vf_induction1.ll           |  6 ++---
 .../uniform_across_vf_induction1_and.ll       | 16 +++++------
 .../uniform_across_vf_induction1_div_urem.ll  |  6 ++---
 .../uniform_across_vf_induction1_lshr.ll      | 18 ++++++-------
 .../uniform_across_vf_induction2.ll           |  6 ++---
 7 files changed, 50 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index b0dc2398ea8c4..030f574dcef29 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -39,18 +39,37 @@ const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
 
 /// Returns true if \p VPV is uniform after vectorization.
 inline bool isUniformAfterVectorization(const VPValue *VPV) {
+  auto IsKnownUniformOpcode = [](auto *R) -> bool {
+    return Instruction::isBinaryOp(R->getOpcode()) ||
+           Instruction::isCast(R->getOpcode()) ||
+           R->getOpcode() == Instruction::GetElementPtr ||
+           R->getOpcode() == Instruction::ICmp ||
+           R->getOpcode() == Instruction::FCmp;
+  };
+
   // A value defined outside the vector region must be uniform after
   // vectorization inside a vector region.
-  if (VPV->isDefinedOutsideLoopRegions())
+  if (VPV->isLiveIn())
     return true;
-  if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV))
-    return Rep->isUniform();
+  if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
+    const VPRegionBlock *ParentR = Rep->getParent()->getParent();
+    return Rep->isUniform() ||
+           (IsKnownUniformOpcode(Rep) &&
+            (!ParentR || !ParentR->isReplicator()) &&
+            all_of(Rep->operands(), isUniformAfterVectorization));
+  }
+
   if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe>(VPV))
     return all_of(VPV->getDefiningRecipe()->operands(),
                   isUniformAfterVectorization);
+  if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
+    return IsKnownUniformOpcode(WidenR) &&
+           all_of(WidenR->operands(), isUniformAfterVectorization);
+  }
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
-           ((Instruction::isBinaryOp(VPI->getOpcode()) ||
+           ((IsKnownUniformOpcode(VPI) ||
+             VPI->getOpcode() == VPInstruction::Broadcast ||
              VPI->getOpcode() == VPInstruction::PtrAdd) &&
             all_of(VPI->operands(), isUniformAfterVectorization));
 
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index b96a768bba24d..f36161703dba5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -557,21 +557,21 @@ define i32 @header_mask_and_invariant_compare(ptr %A, ptr %B, ptr %C, ptr %D, pt
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = extractelement <4 x i1> [[TMP8]], i32 1
 ; DEFAULT-NEXT:    br i1 [[TMP12]], label %[[PRED_STORE_IF32:.*]], label %[[PRED_STORE_CONTINUE33:.*]]
 ; DEFAULT:       [[PRED_STORE_IF32]]:
-; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 1
+; DEFAULT-NEXT:    [[TMP13:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
 ; DEFAULT-NEXT:    store i32 [[TMP13]], ptr [[E]], align 4, !alias.scope [[META14]], !noalias [[META16]]
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE33]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE33]]:
 ; DEFAULT-NEXT:    [[TMP14:%.*]] = extractelement <4 x i1> [[TMP8]], i32 2
 ; DEFAULT-NEXT:    br i1 [[TMP14]], label %[[PRED_STORE_IF34:.*]], label %[[PRED_STORE_CONTINUE35:.*]]
 ; DEFAULT:       [[PRED_STORE_IF34]]:
-; DEFAULT-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
+; DEFAULT-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
 ; DEFAULT-NEXT:    store i32 [[TMP15]], ptr [[E]], align 4, !alias.scope [[META14]], !noalias [[META16]]
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE35]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE35]]:
 ; DEFAULT-NEXT:    [[TMP21:%.*]] = extractelement <4 x i1> [[TMP8]], i32 3
 ; DEFAULT-NEXT:    br i1 [[TMP21]], label %[[PRED_STORE_IF36:.*]], label %[[PRED_STORE_CONTINUE37]]
 ; DEFAULT:       [[PRED_STORE_IF36]]:
-; DEFAULT-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; DEFAULT-NEXT:    [[TMP22:%.*]] = extractelement <4 x i32> [[TMP6]], i32 0
 ; DEFAULT-NEXT:    store i32 [[TMP22]], ptr [[E]], align 4, !alias.scope [[META14]], !noalias [[META16]]
 ; DEFAULT-NEXT:    br label %[[PRED_STORE_CONTINUE37]]
 ; DEFAULT:       [[PRED_STORE_CONTINUE37]]:
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
index 834637b3ca75d..d9f0e180bdaa2 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1.ll
@@ -74,12 +74,12 @@ define void @ld_div2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; CHECK-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
index 74694b40f8002..e85cf2471144b 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_and.ll
@@ -74,12 +74,12 @@ define void @ld_and_neg2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[INDEX]], -2
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -269,15 +269,11 @@ define void @ld_and_neg2_step2_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 42
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0
-; CHECK-NEXT:    store i64 [[TMP8]], ptr [[TMP6]], align 8
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1
-; CHECK-NEXT:    store i64 [[TMP9]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[TMP6]], align 8
+; CHECK-NEXT:    store i64 [[TMP5]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
index 28a650f1e7b4e..12b7353a7803d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_div_urem.ll
@@ -290,12 +290,12 @@ define void @ld_div8_urem3(ptr noalias %A, ptr noalias %B) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = urem i64 [[TMP1]], 3
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i64 [[TMP4]], 42
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[TMP5]], i64 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <8 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    store <8 x i64> [[TMP5]], ptr [[TMP7]], align 8
+; CHECK-NEXT:    store <8 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
index 513179e9fe7f0..6b501905c33d1 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction1_lshr.ll
@@ -113,12 +113,12 @@ define void @ld_lshr1_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP0:%.*]] = lshr i64 [[INDEX]], 1
 ; VF2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
 ; VF2-NEXT:    [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i64 0
+; VF2-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP2]], 42
+; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP3:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP3]], ptr [[TMP5]], align 8
+; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF2-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -225,12 +225,12 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP1:%.*]] = lshr i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
 ; VF2-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i64 0
+; VF2-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
+; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP4:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; VF2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP4]], ptr [[TMP6]], align 8
+; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF2-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -264,12 +264,12 @@ define void @ld_lshr2_step1_start0_ind1(ptr noalias %A, ptr noalias %B) {
 ; VF4-NEXT:    [[TMP1:%.*]] = lshr i64 [[INDEX]], 2
 ; VF4-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
 ; VF4-NEXT:    [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8
-; VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
+; VF4-NEXT:    [[TMP4:%.*]] = add nsw i64 [[TMP3]], 42
+; VF4-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
 ; VF4-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; VF4-NEXT:    [[TMP4:%.*]] = add nsw <4 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF4-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; VF4-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
-; VF4-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP6]], align 8
+; VF4-NEXT:    store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
 ; VF4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; VF4-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF4-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
index be32d47f159c3..b3af23c49637d 100644
--- a/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
+++ b/llvm/test/Transforms/LoopVectorize/uniform_across_vf_induction2.ll
@@ -157,12 +157,12 @@ define void @ld_div2_step1_start0_ind2(ptr noalias %A, ptr noalias %B) {
 ; VF2-NEXT:    [[TMP4:%.*]] = add i64 [[TMP2]], [[TMP3]]
 ; VF2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]]
 ; VF2-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8
-; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP6]], i64 0
+; VF2-NEXT:    [[TMP7:%.*]] = add nsw i64 [[TMP6]], 42
+; VF2-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP7]], i64 0
 ; VF2-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; VF2-NEXT:    [[TMP7:%.*]] = add nsw <2 x i64> [[BROADCAST_SPLAT]], splat (i64 42)
 ; VF2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[INDEX]]
 ; VF2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0
-; VF2-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP9]], align 8
+; VF2-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP9]], align 8
 ; VF2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF2-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
 ; VF2-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]

>From de47c858c0ae05eb957f8360be16347b668d97a7 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 30 Apr 2025 21:23:59 +0100
Subject: [PATCH 2/2] !fixup address latest comments, thanks

---
 llvm/lib/Transforms/Vectorize/VPlanUtils.h | 37 ++++++++++++----------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index 030f574dcef29..ca582cb0d710d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -39,38 +39,43 @@ const SCEV *getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE);
 
 /// Returns true if \p VPV is uniform after vectorization.
 inline bool isUniformAfterVectorization(const VPValue *VPV) {
-  auto IsKnownUniformOpcode = [](auto *R) -> bool {
-    return Instruction::isBinaryOp(R->getOpcode()) ||
-           Instruction::isCast(R->getOpcode()) ||
-           R->getOpcode() == Instruction::GetElementPtr ||
-           R->getOpcode() == Instruction::ICmp ||
-           R->getOpcode() == Instruction::FCmp;
+  auto PreservesUniformity = [](unsigned Opcode) -> bool {
+    if (Instruction::isBinaryOp(Opcode) || Instruction::isCast(Opcode))
+      return true;
+    switch (Opcode) {
+    case Instruction::GetElementPtr:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case VPInstruction::Broadcast:
+    case VPInstruction::PtrAdd:
+      return true;
+    default:
+      return false;
+    }
   };
 
-  // A value defined outside the vector region must be uniform after
-  // vectorization inside a vector region.
+  // A live-in must be uniform across the scope of VPlan.
   if (VPV->isLiveIn())
     return true;
+
   if (auto *Rep = dyn_cast<VPReplicateRecipe>(VPV)) {
-    const VPRegionBlock *ParentR = Rep->getParent()->getParent();
+    const VPRegionBlock *RegionOfR = Rep->getParent()->getParent();
+    if (RegionOfR && RegionOfR->isReplicator())
+      return false;
     return Rep->isUniform() ||
-           (IsKnownUniformOpcode(Rep) &&
-            (!ParentR || !ParentR->isReplicator()) &&
+           (PreservesUniformity(Rep->getOpcode()) &&
             all_of(Rep->operands(), isUniformAfterVectorization));
   }
-
   if (isa<VPWidenGEPRecipe, VPDerivedIVRecipe, VPBlendRecipe>(VPV))
     return all_of(VPV->getDefiningRecipe()->operands(),
                   isUniformAfterVectorization);
   if (auto *WidenR = dyn_cast<VPWidenRecipe>(VPV)) {
-    return IsKnownUniformOpcode(WidenR) &&
+    return PreservesUniformity(WidenR->getOpcode()) &&
            all_of(WidenR->operands(), isUniformAfterVectorization);
   }
   if (auto *VPI = dyn_cast<VPInstruction>(VPV))
     return VPI->isSingleScalar() || VPI->isVectorToScalar() ||
-           ((IsKnownUniformOpcode(VPI) ||
-             VPI->getOpcode() == VPInstruction::Broadcast ||
-             VPI->getOpcode() == VPInstruction::PtrAdd) &&
+           (PreservesUniformity(VPI->getOpcode()) &&
             all_of(VPI->operands(), isUniformAfterVectorization));
 
   // VPExpandSCEVRecipes must be placed in the entry and are alway uniform.