[llvm] 8df64ed - [LV] Don't consider IV increments uniform if exit value is used outside.

Florian Hahn via llvm-commits <llvm-commits at lists.llvm.org>
Sun Jan 12 14:03:45 PST 2025


Author: Florian Hahn
Date: 2025-01-12T22:03:21Z
New Revision: 8df64ed77727ab9b7540819f2fe64379e88a50be

URL: https://github.com/llvm/llvm-project/commit/8df64ed77727ab9b7540819f2fe64379e88a50be
DIFF: https://github.com/llvm/llvm-project/commit/8df64ed77727ab9b7540819f2fe64379e88a50be.diff

LOG: [LV] Don't consider IV increments uniform if exit value is used outside.

In some cases, there may be a chain of uniform instructions producing
the exit value. To generate correct code in all cases, do not consider
the IV increment uniform if it has users outside the loop.

Instead, let VPlan narrow the IV if possible, using the logic from
3ff1d01985752.

Test case from #122602 verified with Alive2:
    https://alive2.llvm.org/ce/z/bA4EGj

Fixes https://github.com/llvm/llvm-project/issues/122496.
Fixes https://github.com/llvm/llvm-project/issues/122602.
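
For illustration, a minimal sketch of the affected pattern (function and
value names here are illustrative, not taken from the patch): the scalar IV
increment feeds an LCSSA phi in the exit block, so its final value is needed
outside the loop. With this change, such an increment is no longer classified
as uniform up front; VPlan may still narrow the IV later where the logic from
3ff1d01985752 applies.

    define i64 @iv_increment_used_outside(ptr %dst) {
    entry:
      br label %loop

    loop:
      %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
      %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
      store i32 0, ptr %gep, align 4
      ; %iv.next has a user outside the loop (%res), so it is no longer
      ; treated as uniform after vectorization.
      %iv.next = add nuw nsw i64 %iv, 1
      %ec = icmp eq i64 %iv.next, 1000
      br i1 %ec, label %exit, label %loop

    exit:
      %res = phi i64 [ %iv.next, %loop ]   ; exit value of the IV increment
      ret i64 %res
    }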

Added: 
    llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
    llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
    llvm/test/Transforms/LoopVectorize/iv_outside_user.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index b017b61a45a0c3..d32a463a996c4f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3806,7 +3806,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
     // uniform after vectorization.
     bool UniformIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+      return I == Ind || Worklist.count(I) ||
              IsVectorizedMemAccessUse(I, IndUpdate);
     });
     if (!UniformIndUpdate)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f440bf2eb02237..545d277d7aa018 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -621,28 +621,6 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
       Def->replaceAllUsesWith(Clone);
     }
 
-    // Check if any uniform VPReplicateRecipes using the phi recipe are used by
-    // ExtractFromEnd. Those must be replaced by a regular VPReplicateRecipe to
-    // ensure the final value is available.
-    // TODO: Remove once uniformity analysis is done on VPlan.
-    for (VPUser *U : Users) {
-      auto *ExitIRI = dyn_cast<VPIRInstruction>(U);
-      VPValue *Op;
-      if (!ExitIRI || !match(ExitIRI->getOperand(0),
-                             m_VPInstruction<VPInstruction::ExtractFromEnd>(
-                                 m_VPValue(Op), m_VPValue())))
-        continue;
-      auto *RepR = dyn_cast<VPReplicateRecipe>(Op);
-      if (!RepR || !RepR->isUniform())
-        continue;
-      assert(!RepR->isPredicated() && "RepR must not be predicated");
-      Instruction *I = RepR->getUnderlyingInstr();
-      auto *Clone =
-          new VPReplicateRecipe(I, RepR->operands(), /*IsUniform*/ false);
-      Clone->insertAfter(RepR);
-      RepR->replaceAllUsesWith(Clone);
-    }
-
     // Replace wide pointer inductions which have only their scalars used by
     // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
     if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {

diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
index f0641154f85c7b..5e97627e7688b3 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/uniform-phi.ll
@@ -51,8 +51,6 @@ for.end:
 
 ; CHECK-LABEL: goo
 ; Check %indvars.iv and %indvars.iv.next are uniform instructions even if they are used outside of loop.
-; CHECK-DAG: LV: Found uniform instruction:   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-; CHECK-DAG: LV: Found uniform instruction:   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 ; CHECK-DAG: LV: Found uniform instruction:   %exitcond = icmp eq i64 %indvars.iv, 1599
 
 define i64 @goo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) #0 {

diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
index 6b0c677b56d2c6..3e61546da2cebc 100644
--- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll
@@ -1176,7 +1176,6 @@ e.exit:
 }
 
 ; Test case for https://github.com/llvm/llvm-project/issues/122496.
-; FIXME: Currently an incorrect live-out is used.
 define i32 @iv_ext_used_outside( ptr %dst) {
 ; VEC-LABEL: define i32 @iv_ext_used_outside(
 ; VEC-SAME: ptr [[DST:%.*]]) {
@@ -1186,15 +1185,19 @@ define i32 @iv_ext_used_outside( ptr %dst) {
 ; VEC-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; VEC:       [[VECTOR_BODY]]:
 ; VEC-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; VEC-NEXT:    [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; VEC-NEXT:    [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
 ; VEC-NEXT:    [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
 ; VEC-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[DST]], i16 [[TMP0]]
 ; VEC-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP1]], i32 0
 ; VEC-NEXT:    store <2 x i32> zeroinitializer, ptr [[TMP2]], align 4
-; VEC-NEXT:    [[TMP3:%.*]] = add nuw nsw i16 [[TMP0]], 1
+; VEC-NEXT:    [[TMP5:%.*]] = add nuw nsw <2 x i16> [[VEC_IND]], splat (i16 1)
+; VEC-NEXT:    [[TMP3:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0
 ; VEC-NEXT:    [[TMP4:%.*]] = zext nneg i16 [[TMP3]] to i32
-; VEC-NEXT:    [[TMP5:%.*]] = zext nneg i16 [[TMP3]] to i32
+; VEC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1
+; VEC-NEXT:    [[TMP7:%.*]] = zext nneg i16 [[TMP8]] to i32
 ; VEC-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; VEC-NEXT:    [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2)
 ; VEC-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
 ; VEC-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
 ; VEC:       [[MIDDLE_BLOCK]]:
@@ -1213,7 +1216,7 @@ define i32 @iv_ext_used_outside( ptr %dst) {
 ; VEC-NEXT:    [[EC:%.*]] = icmp samesign ult i16 [[IV_1]], 128
 ; VEC-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], {{!llvm.loop ![0-9]+}}
 ; VEC:       [[EXIT]]:
-; VEC-NEXT:    [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    [[IV_1_EXT_LCSSA:%.*]] = phi i32 [ [[IV_1_EXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i32 [[IV_1_EXT_LCSSA]]
 ;
 ; INTERLEAVE-LABEL: define i32 @iv_ext_used_outside(
@@ -1274,7 +1277,6 @@ exit:
 }
 
 ; Test case for https://github.com/llvm/llvm-project/issues/122602.
-; FIXME: Currently an incorrect live-out is used.
 define i64 @test_iv_increment_incremented(ptr %dst) {
 ; VEC-LABEL: define i64 @test_iv_increment_incremented(
 ; VEC-SAME: ptr [[DST:%.*]]) {
@@ -1288,8 +1290,9 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
 ; VEC-NEXT:    [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i32 -1
 ; VEC-NEXT:    store <2 x i16> splat (i16 1), ptr [[TMP2]], align 2
 ; VEC-NEXT:    [[TMP3:%.*]] = add i64 2, -1
+; VEC-NEXT:    [[TMP5:%.*]] = add i64 1, -1
 ; VEC-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], 1
-; VEC-NEXT:    [[TMP5:%.*]] = add i64 [[TMP3]], 1
+; VEC-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], 1
 ; VEC-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
 ; VEC:       [[MIDDLE_BLOCK]]:
 ; VEC-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -1307,7 +1310,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) {
 ; VEC-NEXT:    [[IV_1_NEXT]] = add i64 [[IV_2_NEXT]], 1
 ; VEC-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], {{!llvm.loop ![0-9]+}}
 ; VEC:       [[EXIT]]:
-; VEC-NEXT:    [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ]
+; VEC-NEXT:    [[IV_1_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_1_NEXT]], %[[LOOP]] ], [ [[TMP6]], %[[MIDDLE_BLOCK]] ]
 ; VEC-NEXT:    ret i64 [[IV_1_NEXT_LCSSA]]
 ;
 ; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented(

diff --git a/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
new file mode 100644
index 00000000000000..d88b0583ffc042
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-iv-outside-user.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -scalable-vectorization=on -force-target-supports-scalable-vectors=true -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S %s | FileCheck %s
+
+define i32 @iv_live_out_wide(ptr %dst) {
+; CHECK-LABEL: define i32 @iv_live_out_wide(
+; CHECK-SAME: ptr [[DST:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[STEP_1:%.*]] = sext i8 0 to i32
+; CHECK-NEXT:    [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 2000, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 2000, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 2000, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP4]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = mul i32 [[TMP5]], 2
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.stepvector.nxv2i32()
+; CHECK-NEXT:    [[TMP8:%.*]] = mul <vscale x 2 x i32> [[TMP7]], splat (i32 1)
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 2 x i32> zeroinitializer, [[TMP8]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[TMP5]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[STEP_2]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 2 x i32> [ [[INDUCTION]], %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <vscale x 2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP13:%.*]] = mul i64 [[TMP12]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP13]]
+; CHECK-NEXT:    store <vscale x 2 x i16> zeroinitializer, ptr [[TMP11]], align 2
+; CHECK-NEXT:    store <vscale x 2 x i16> zeroinitializer, ptr [[TMP14]], align 2
+; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i32> [[BROADCAST_SPLAT2]], [[STEP_ADD]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP6]]
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 2 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP17:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP18:%.*]] = mul i32 [[TMP17]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = sub i32 [[TMP18]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <vscale x 2 x i32> [[TMP15]], i32 [[TMP19]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 2000, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[E_EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]]
+; CHECK-NEXT:    store i16 0, ptr [[GEP_DST]], align 2
+; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]]
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp slt i32 [[IV_NEXT]], 2000
+; CHECK-NEXT:    br i1 [[CMP_I]], label %[[LOOP]], label %[[E_EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[E_EXIT]]:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[IV_NEXT]], %[[LOOP]] ], [ [[TMP20]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %step.1 = sext i8 0 to i32
+  %step.2 = add nsw i32 %step.1, 1
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i32 %iv
+  store i16 0, ptr %gep.dst, align 2
+  %iv.next = add i32 %step.2, %iv
+  %cmp.i = icmp slt i32 %iv.next, 2000
+  br i1 %cmp.i, label %loop, label %e.exit
+
+e.exit:
+  %res = phi i32 [ %iv.next, %loop ]
+  ret i32 %res
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
