[llvm] 4d4a60c - [VPlan] Fix LastActiveLane assertion on scalar VF (#167897)

via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 17 03:03:42 PST 2025


Author: Luke Lau
Date: 2025-11-17T11:03:38Z
New Revision: 4d4a60cde00539db5033dcbc289ed2ec37b9ad8b

URL: https://github.com/llvm/llvm-project/commit/4d4a60cde00539db5033dcbc289ed2ec37b9ad8b
DIFF: https://github.com/llvm/llvm-project/commit/4d4a60cde00539db5033dcbc289ed2ec37b9ad8b.diff

LOG: [VPlan] Fix LastActiveLane assertion on scalar VF (#167897)

For a scalar-only VPlan with tail folding, if it has a live-out phi then
legalizeAndOptimizeInductions will scalarize the widened canonical IV
feeding into the header mask:

    <x1> vector loop: {
      vector.body:
        EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
        vp<%5> = SCALAR-STEPS vp<%4>, ir<1>, vp<%0>
        EMIT vp<%6> = icmp ule vp<%5>, vp<%3>
        EMIT vp<%index.next> = add nuw vp<%4>, vp<%1>
        EMIT branch-on-count vp<%index.next>, vp<%2>
      No successors
    }
    Successor(s): middle.block

    middle.block:
      EMIT vp<%8> = last-active-lane vp<%6>
      EMIT vp<%9> = extract-lane vp<%8>, vp<%5>
    Successor(s): ir-bb<exit>

The verifier complains about this, but the generated code should still
compute the correct last active lane, so this fixes the assert by handling
the case in isHeaderMask (see the assembled function after the hunk below).
A similar pattern already exists there for ActiveLaneMask, which likewise
matches a VPScalarIVSteps recipe.

Fixes #167813

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
    llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 7ac4010b7b320..3bc2dfd623777 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -58,13 +58,21 @@ bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
 
   VPValue *A, *B;
 
+  auto m_CanonicalScalarIVSteps =
+      m_ScalarIVSteps(m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
+                      m_One(), m_Specific(&Plan.getVF()));
+
   if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
     return B == Plan.getTripCount() &&
-           (match(A,
-                  m_ScalarIVSteps(
-                      m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
-                      m_One(), m_Specific(&Plan.getVF()))) ||
-            IsWideCanonicalIV(A));
+           (match(A, m_CanonicalScalarIVSteps) || IsWideCanonicalIV(A));
+
+  // For scalar plans, the header mask uses the scalar steps.
+  if (match(V, m_ICmp(m_CanonicalScalarIVSteps,
+                      m_Specific(Plan.getBackedgeTakenCount())))) {
+    assert(Plan.hasScalarVFOnly() &&
+           "Non-scalar VF using scalar IV steps for header mask?");
+    return true;
+  }
 
   return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) && IsWideCanonicalIV(A) &&
          B == Plan.getBackedgeTakenCount();
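
For reference, vputils::isHeaderMask reads roughly as follows with this
patch applied. This is a sketch reassembled from the hunk above; the
IsWideCanonicalIV lambda defined earlier in the function is elided, and the
comments are editorial:

    bool vputils::isHeaderMask(const VPValue *V, const VPlan &Plan) {
      // ... definition of IsWideCanonicalIV elided ...
      VPValue *A, *B;

      auto m_CanonicalScalarIVSteps =
          m_ScalarIVSteps(m_Specific(Plan.getVectorLoopRegion()->getCanonicalIV()),
                          m_One(), m_Specific(&Plan.getVF()));

      // An active-lane-mask is a header mask if it is computed against the
      // trip count from either the canonical scalar IV steps or a wide
      // canonical IV.
      if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_One())))
        return B == Plan.getTripCount() &&
               (match(A, m_CanonicalScalarIVSteps) || IsWideCanonicalIV(A));

      // New case: for scalar plans, the header mask compares the scalar IV
      // steps against the backedge-taken count.
      if (match(V, m_ICmp(m_CanonicalScalarIVSteps,
                          m_Specific(Plan.getBackedgeTakenCount())))) {
        assert(Plan.hasScalarVFOnly() &&
               "Non-scalar VF using scalar IV steps for header mask?");
        return true;
      }

      // Otherwise, a compare of a wide canonical IV against the
      // backedge-taken count.
      return match(V, m_ICmp(m_VPValue(A), m_VPValue(B))) &&
             IsWideCanonicalIV(A) && B == Plan.getBackedgeTakenCount();
    }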

diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
index 3bc5da155b351..277be4666243b 100644
--- a/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
+++ b/llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
-; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -pass-remarks='loop-vectorize' -disable-output -S 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
+; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s
 
 ; These tests are to check that fold-tail procedure produces correct scalar code when
 ; loop-vectorization is only unrolling but not vectorizing.
@@ -141,5 +141,61 @@ for.body:
   %cond = icmp eq ptr %ptr, %ptr2
   br i1 %cond, label %for.cond.cleanup, label %for.body
 }
+
+define i64 @live_out_scalar_vf(i64 %n) {
+; CHECK-LABEL: @live_out_scalar_vf(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
+; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[STEP_ADD_3]], i32 3
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[STEP_ADD_3]], i32 2
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[EXITVAL:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[TMP19:%.*]] = phi i64 [ [[EXITVAL]], [[LOOP]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i64 [[TMP19]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  ; Need to use a phi otherwise the header mask will use a
+  ; VPWidenCanonicalIVRecipe instead of a VPScalarIVStepsRecipe.
+  %exitval = phi i64 [ 0, %entry ], [ %iv, %loop ]
+  %iv.next = add i64 %iv, 1
+  %ec = icmp eq i64 %iv, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i64 %exitval
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CHECK-REMARKS: {{.*}}
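
To reproduce the assertion this patch fixes, one way (an untested sketch,
assuming an assertions-enabled build of opt and the test update applied
without the VPlanUtils.cpp change) is to run the second RUN line above by
hand from an llvm-project checkout:

    opt -passes=loop-vectorize -force-vector-interleave=4 \
        -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
        -S llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll

Before the fix this trips the assertion reported in #167813 on the added
live_out_scalar_vf function; with the fix, the autogenerated check lines
above pass.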

More information about the llvm-commits mailing list