[llvm] a08c773 - [LoopVectorize] Add support for scalable vectorization of induction variables

Tue Mar 30 03:13:51 PDT 2021

Author: David Sherwood
Date: 2021-03-30T11:13:31+01:00
New Revision: a08c7736a771d1be6e3168ff94aa8d7ff4479ef4

URL: https://github.com/llvm/llvm-project/commit/a08c7736a771d1be6e3168ff94aa8d7ff4479ef4
DIFF: https://github.com/llvm/llvm-project/commit/a08c7736a771d1be6e3168ff94aa8d7ff4479ef4.diff

LOG: [LoopVectorize] Add support for scalable vectorization of induction variables

This patch adds support for the vectorization of induction variables when
using scalable vectors, which required the following changes:

1. Removed assert from InnerLoopVectorizer::getStepVector.
2. Modified InnerLoopVectorizer::createVectorIntOrFpInductionPHI to use
   a runtime determined value for VF and removed an assert.
3. Modified InnerLoopVectorizer::buildScalarSteps to work for scalable
   vectors. I did this by calculating the full vector value for each Part
   of the unroll factor (UF) and caching this in the VP state. This means
   that we are always able to extract an arbitrary element from the vector
   if necessary. In addition to this, I also permitted the caching of the
   individual lane values themselves for the known minimum number of elements
   in the same way we do for fixed width vectors. This is a further
   optimisation that improves the code quality since it avoids unnecessary
   extractelement operations when extracting the first lane.
4. Added an assert to InnerLoopVectorizer::widenPHIInstruction, since while
   testing some code paths I noticed this is currently broken for scalable
   vectors.

Various tests to support different cases have been added here:

  Transforms/LoopVectorize/AArch64/sve-inductions.ll

Differential Revision: https://reviews.llvm.org/D98715

Added: 
    llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 43443e4a7fd24..be21f61956baa 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2248,16 +2248,20 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
 
   // Multiply the vectorization factor by the step using integer or
   // floating-point arithmetic as appropriate.
-  Value *ConstVF =
-      getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
-  Value *Mul = Builder.CreateBinOp(MulOp, Step, ConstVF);
+  Type *StepType = Step->getType();
+  if (Step->getType()->isFloatingPointTy())
+    StepType = IntegerType::get(StepType->getContext(),
+                                StepType->getScalarSizeInBits());
+  Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
+  if (Step->getType()->isFloatingPointTy())
+    RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
+  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
 
   // Create a vector splat to use in the induction update.
   //
   // FIXME: If the step is non-constant, we create the vector splat with
   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
   //        handle a constant vector splat.
-  assert(!VF.isScalable() && "scalable vectors not yet supported.");
   Value *SplatVF = isa<Constant>(Mul)
                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                        : Builder.CreateVectorSplat(VF, Mul);
@@ -2460,8 +2464,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
                                           Instruction::BinaryOps BinOp) {
   // Create and check the types.
-  assert(isa<FixedVectorType>(Val->getType()) &&
-         "Creation of scalable step vector not yet supported");
   auto *ValVTy = cast<VectorType>(Val->getType());
   ElementCount VLen = ValVTy->getElementCount();
 
@@ -2532,23 +2534,46 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If EntryVal is uniform, we only need to generate the first
   // lane. Otherwise, we generate all VF values.
-  unsigned Lanes =
-      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
-          ? 1
-          : VF.getKnownMinValue();
-  assert((!VF.isScalable() || Lanes == 1) &&
-         "Should never scalarize a scalable vector");
+  bool IsUniform =
+      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
+  unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
   // Compute the scalar steps and save the results in State.
+  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
+                                     ScalarIVTy->getScalarSizeInBits());
+  Type *VecIVTy = nullptr;
+  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
+  if (!IsUniform && VF.isScalable()) {
+    VecIVTy = VectorType::get(ScalarIVTy, VF);
+    UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
+    SplatStep = Builder.CreateVectorSplat(VF, Step);
+    SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
+  }
+
   for (unsigned Part = 0; Part < UF; ++Part) {
-    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
-                                         ScalarIVTy->getScalarSizeInBits());
-      Value *StartIdx =
-          createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+    Value *StartIdx0 =
+        createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
+
+    if (!IsUniform && VF.isScalable()) {
+      auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
+      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
       if (ScalarIVTy->isFloatingPointTy())
-        StartIdx = Builder.CreateSIToFP(StartIdx, ScalarIVTy);
-      StartIdx = Builder.CreateBinOp(
-          AddOp, StartIdx, getSignedIntOrFpConstant(ScalarIVTy, Lane));
+        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
+      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
+      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
+      State.set(Def, Add, Part);
+      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
+                                            Part);
+      // It's useful to record the lane values too for the known minimum number
+      // of elements so we do those below. This improves the code quality when
+      // trying to extract the first element, for example.
+    }
+
+    if (ScalarIVTy->isFloatingPointTy())
+      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
+
+    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+      Value *StartIdx = Builder.CreateBinOp(
+          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
       // The step returned by `createStepForVF` is a runtime-evaluated value
       // when VF is scalable. Otherwise, it should be folded into a Constant.
       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
@@ -4723,6 +4748,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
+    assert(!VF.isScalable() && "Currently unsupported for scalable vectors");
 
     if (Cost->isScalarAfterVectorization(P, State.VF)) {
       // This is the normalized GEP that starts counting at zero.

diff  --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
new file mode 100644
index 0000000000000..324a2d144e2fd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -0,0 +1,233 @@
+; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -loop-vectorize -dce -instcombine < %s -S 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+; Test that we can add on the induction variable
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = b[i] + i;
+;   }
+; with an unroll factor (interleave count) of 2.
+
+define void @add_ind64_unrolled(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled(
+; CHECK-NEXT:  entry:
+; CHECK: vector.body:
+; CHECK-NEXT:  %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT:  %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:  %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:  %[[IDXSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:  %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:  %[[VSCALE:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:  %[[EC:.*]] = shl i64 %[[VSCALE]], 1
+; CHECK-NEXT:  %[[TMP2:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT:  %[[ECSPLT:.*]] = shufflevector <vscale x 2 x i64> %[[TMP2]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:  %[[TMP3:.*]] = add <vscale x 2 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:  %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK:       %[[LOAD1:.*]] = load <vscale x 2 x i64>
+; CHECK:       %[[LOAD2:.*]] = load <vscale x 2 x i64>
+; CHECK:       %[[STOREVAL1:.*]] = add nsw <vscale x 2 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK:       %[[STOREVAL2:.*]] = add nsw <vscale x 2 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK:       store <vscale x 2 x i64> %[[STOREVAL1]]
+; CHECK:       store <vscale x 2 x i64> %[[STOREVAL2]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.08
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                 ; preds = %for.body
+  ret void
+}
+
+
+; Same as above, except we test with a vectorisation factor of (1, scalable)
+
+define void @add_ind64_unrolled_nxv1i64(i64* noalias nocapture %a, i64* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @add_ind64_unrolled_nxv1i64(
+; CHECK-NEXT:  entry:
+; CHECK: vector.body:
+; CHECK-NEXT:  %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK-NEXT:  %[[STEPVEC:.*]] = call <vscale x 1 x i64> @llvm.experimental.stepvector.nxv1i64()
+; CHECK-NEXT:  %[[TMP1:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT:  %[[IDXSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP1]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:  %[[VECIND1:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:  %[[EC:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:  %[[TMP2:.*]] = insertelement <vscale x 1 x i64> poison, i64 %[[EC]], i32 0
+; CHECK-NEXT:  %[[ECSPLT:.*]] = shufflevector <vscale x 1 x i64> %[[TMP2]], <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+; CHECK-NEXT:  %[[TMP3:.*]] = add <vscale x 1 x i64> %[[ECSPLT]], %[[STEPVEC]]
+; CHECK-NEXT:  %[[VECIND2:.*]] = add <vscale x 1 x i64> %[[IDXSPLT]], %[[TMP3]]
+; CHECK:       %[[LOAD1:.*]] = load <vscale x 1 x i64>
+; CHECK:       %[[LOAD2:.*]] = load <vscale x 1 x i64>
+; CHECK:       %[[STOREVAL1:.*]] = add nsw <vscale x 1 x i64> %[[LOAD1]], %[[VECIND1]]
+; CHECK:       %[[STOREVAL2:.*]] = add nsw <vscale x 1 x i64> %[[LOAD2]], %[[VECIND2]]
+; CHECK:       store <vscale x 1 x i64> %[[STOREVAL1]]
+; CHECK:       store <vscale x 1 x i64> %[[STOREVAL2]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i64, i64* %b, i64 %i.08
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %i.08
+  %arrayidx1 = getelementptr inbounds i64, i64* %a, i64 %i.08
+  store i64 %add, i64* %arrayidx1, align 8
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !9
+
+exit:                                 ; preds = %for.body
+  ret void
+}
+
+
+; Test that we can vectorize a separate induction variable (not used for the branch)
+;   int r = 0;
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = r;
+;     r += 2;
+;   }
+; with an unroll factor (interleave count) of 1.
+
+
+define void @add_unique_ind32(i32* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_ind32(
+; CHECK:    vector.ph:
+; CHECK:      %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: %[[INDINIT:.*]] = shl <vscale x 4 x i32> %[[STEPVEC]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> undef, i32 1, i32 0), <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[INC:.*]] = shl i32 %[[VSCALE]], 3
+; CHECK-NEXT: %[[TMP:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INC]], i32 0
+; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x i32> %[[TMP]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:    vector.body:
+; CHECK:      %[[VECIND:.*]] = phi <vscale x 4 x i32> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK:      store <vscale x 4 x i32> %[[VECIND]]
+; CHECK:      %[[VECINDNXT]] = add <vscale x 4 x i32> %[[VECIND]], %[[VECINC]]
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %i.08
+  store i32 %r.07, i32* %arrayidx, align 4
+  %add = add nuw nsw i32 %r.07, 2
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                 ; preds = %for.body
+  ret void
+}
+
+
+; Test that we can vectorize a separate FP induction variable (not used for the branch)
+;   float r = 0;
+;   for (long long i = 0; i < n; i++) {
+;     a[i] = r;
+;     r += 2;
+;   }
+; with an unroll factor (interleave count) of 1.
+
+define void @add_unique_indf32(float* noalias nocapture %a, i64 %n) {
+; CHECK-LABEL: @add_unique_indf32(
+; CHECK:    vector.ph:
+; CHECK:      %[[STEPVEC:.*]] = call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
+; CHECK-NEXT: %[[TMP1:.*]] = uitofp <vscale x 4 x i32> %[[STEPVEC]] to <vscale x 4 x float>
+; CHECK-NEXT: %[[TMP2:.*]] = fmul <vscale x 4 x float> %[[TMP1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[INDINIT:.*]] = fadd <vscale x 4 x float> %[[TMP2]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[TMP3:.*]] = shl i32 %8, 2
+; CHECK-NEXT: %[[TMP4:.*]] = sitofp i32 %[[TMP3]] to float
+; CHECK-NEXT: %[[INC:.*]] = fmul float %[[TMP4]], 2.000000e+00
+; CHECK-NEXT: %[[TMP5:.*]] = insertelement <vscale x 4 x float> poison, float %[[INC]], i32 0
+; CHECK-NEXT: %[[VECINC:.*]] = shufflevector <vscale x 4 x float> %[[TMP5]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK:   vector.body:
+; CHECK:     %[[VECIND:.*]] = phi <vscale x 4 x float> [ %[[INDINIT]], %vector.ph ], [ %[[VECINDNXT:.*]], %vector.body ]
+; CHECK:     store <vscale x 4 x float> %[[VECIND]]
+; CHECK:     %[[VECINDNXT]] = fadd <vscale x 4 x float> %[[VECIND]], %[[VECINC]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.08 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %r.07 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %i.08
+  store float %r.07, float* %arrayidx, align 4
+  %add = fadd float %r.07, 2.000000e+00
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                 ; preds = %for.body
+  ret void
+}
+
+; Test a case where the vectorised induction variable is used to
+; generate a mask:
+;   for (long long i = 0; i < n; i++) {
+;     if (i & 0x1)
+;       a[i] = b[i];
+;   }
+
+define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @cond_ind64(
+; CHECK:    vector.body:
+; CHECK-NEXT: %[[INDEX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK:      %[[STEPVEC:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 4 x i64> poison, i64 %[[INDEX]], i32 0
+; CHECK-NEXT: %[[IDXSPLT:.*]] = shufflevector <vscale x 4 x i64> %[[TMP1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: %[[VECIND:.*]] = add <vscale x 4 x i64> %[[IDXSPLT]], %[[STEPVEC]]
+; CHECK-NEXT: %[[MASK:.*]] = trunc <vscale x 4 x i64> %[[VECIND]] to <vscale x 4 x i1>
+; CHECK:      %[[LOAD:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]], <vscale x 4 x i32> poison)
+; CHECK:      call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[LOAD]], <vscale x 4 x i32>* %{{.*}}, i32 4, <vscale x 4 x i1> %[[MASK]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i.08 = phi i64 [ %inc, %for.inc ], [ 0, %entry ]
+  %and = and i64 %i.08, 1
+  %tobool.not = icmp eq i64 %and, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %i.08
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %i.08
+  store i32 %0, i32* %arrayidx1, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i64 %i.08, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                 ; preds = %for.inc
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 2}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !7, !3, !8, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 4}
+!8 = !{!"llvm.loop.interleave.count", i32 1}
+!9 = distinct !{!9, !1, !10, !3, !4, !5}
+!10 = !{!"llvm.loop.vectorize.width", i32 1}