[llvm] 95800da - [LoopVectorize] Add support for replication of more intrinsics with scalable vectors

David Sherwood via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 5 07:17:36 PDT 2021


Author: David Sherwood
Date: 2021-08-05T15:17:27+01:00
New Revision: 95800da914938129083df2fa0165c1901909c273

URL: https://github.com/llvm/llvm-project/commit/95800da914938129083df2fa0165c1901909c273
DIFF: https://github.com/llvm/llvm-project/commit/95800da914938129083df2fa0165c1901909c273.diff

LOG: [LoopVectorize] Add support for replication of more intrinsics with scalable vectors

This patch adds more instructions to the Uniforms list, such as certain
intrinsics that are uniform by definition or whose operands are loop
invariant. This list includes:

  1. The intrinsics 'experimental.noalias.scope.decl' and 'sideeffect', which
  are always uniform by definition.
  2. If the intrinsics 'lifetime.start', 'lifetime.end' and 'assume' have
  loop invariant input operands then these are also uniform (see the
  sketch after this list).
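
As an illustrative sketch (this loop is hypothetical, not taken from the
patch or its tests), a call to 'assume' whose condition does not depend on
the induction variable has loop-invariant operands, so the call can be
added to the Uniforms list:

  define void @uniform_assume(float* %a, i1 %c) {
  entry:
    br label %for.body

  for.body:
    %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
    ; %c does not depend on %iv, so one call per vector iteration suffices.
    tail call void @llvm.assume(i1 %c)
    %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
    store float 1.000000e+00, float* %arrayidx, align 4
    %iv.next = add nuw nsw i64 %iv, 1
    %exitcond = icmp eq i64 %iv.next, 1024
    br i1 %exitcond, label %for.end, label %for.body

  for.end:
    ret void
  }

  declare void @llvm.assume(i1)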

Also, in VPRecipeBuilder::handleReplication we check if an instruction is
uniform based purely on whether it lives in the Uniforms list. However,
there are certain cases where calls to some intrinsics can be effectively
treated as uniform too. Therefore, we now also treat the following cases
as uniform for scalable vectors:

  1. If the 'assume' intrinsic's operand is not loop invariant, then we
  are free to treat it as uniform anyway, since the intrinsic is only a
  performance hint; we still get the benefit for the first lane.
  2. When the input pointers for 'lifetime.start' and 'lifetime.end' are
  loop variant, then for scalable vectors we assume these still ultimately
  come from the broadcast of an alloca. We do not support scalable
  vectorisation of loops containing alloca instructions, hence the alloca
  itself would be invariant. If the pointer does not come from an alloca
  then the intrinsic itself has no effect. A sketch of both cases follows
  this list.
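
As a rough sketch of the vector-body IR shape for both cases (simplified
from the scalable-assume.ll and scalable-lifetime.ll tests added below;
the function name and constants here are illustrative):

  define void @first_lane_sketch(<vscale x 2 x float> %wide.load, [1024 x i32]* %arr) {
    ; Case 1: a loop-variant assume condition. We compare all lanes but
    ; only assume the first one, since the lane count is unknown.
    %cmp = fcmp ogt <vscale x 2 x float> %wide.load, zeroinitializer
    %lane0 = extractelement <vscale x 2 x i1> %cmp, i32 0
    call void @llvm.assume(i1 %lane0)

    ; Case 2: a loop-variant lifetime pointer, assumed to be the broadcast
    ; of an alloca; extracting lane 0 of the splat recovers the pointer.
    %ins = insertelement <vscale x 2 x [1024 x i32]*> poison, [1024 x i32]* %arr, i32 0
    %splat = shufflevector <vscale x 2 x [1024 x i32]*> %ins, <vscale x 2 x [1024 x i32]*> poison, <vscale x 2 x i32> zeroinitializer
    %bc = bitcast <vscale x 2 x [1024 x i32]*> %splat to <vscale x 2 x i8*>
    %ptr0 = extractelement <vscale x 2 x i8*> %bc, i32 0
    call void @llvm.lifetime.end.p0i8(i64 4096, i8* %ptr0)
    ret void
  }

  declare void @llvm.assume(i1)
  declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)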

I have updated the assume test for fixed width, since we now treat it
as uniform:

  Transforms/LoopVectorize/assume.ll

I've also added new scalable vectorisation tests for other intrinsics:

  Transforms/LoopVectorize/scalable-assume.ll
  Transforms/LoopVectorize/scalable-lifetime.ll
  Transforms/LoopVectorize/scalable-noalias-scope-decl.ll

Differential Revision: https://reviews.llvm.org/D107284

Added: 
    llvm/test/Transforms/LoopVectorize/scalable-assume.ll
    llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
    llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/assume.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0e10fc8bcd05..3c191e076858 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5427,6 +5427,20 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
+      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
+        switch (II->getIntrinsicID()) {
+        case Intrinsic::sideeffect:
+        case Intrinsic::experimental_noalias_scope_decl:
+        case Intrinsic::assume:
+        case Intrinsic::lifetime_start:
+        case Intrinsic::lifetime_end:
+          if (TheLoop->hasLoopInvariantOperands(&I))
+            addToWorklistIfAllowed(&I);
+        default:
+          break;
+        }
+      }
+
       // If there's no pointer operand, there's nothing to do.
       auto *Ptr = getLoadStorePointerOperand(&I);
       if (!Ptr)
@@ -8943,6 +8957,36 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
 
+  // Even if the instruction is not marked as uniform, there are certain
+  // intrinsic calls that can be effectively treated as such, so we check for
+  // them here. Conservatively, we only do this for scalable vectors, since
+  // for fixed-width VFs we can always fall back on full scalarization.
+  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
+    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
+    case Intrinsic::assume:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // For scalable vectors, if one of the operands is variant then we still
+      // want to mark the call as uniform, which will generate one instruction
+      // for just the first lane of the vector. We can't scalarize the call in
+      // the same way as for fixed-width vectors because we don't know how many
+      // lanes there are.
+      //
+      // The reasons for doing it this way for scalable vectors are:
+      //   1. For the assume intrinsic, generating the instruction for the first
+      //      lane is still better than not generating any at all. For
+      //      example, the input may be a splat across all lanes.
+      //   2. For the lifetime start/end intrinsics the pointer operand only
+      //      does anything useful when the input comes from a stack object,
+      //      which suggests it should always be uniform. For non-stack objects
+      //      the effect is to poison the object, which still allows us to
+      //      remove the call.
+      IsUniform = true;
+    default:
+      break;
+    }
+  }
+
   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                        IsUniform, IsPredicated);
   setRecipe(I, Recipe);

diff --git a/llvm/test/Transforms/LoopVectorize/assume.ll b/llvm/test/Transforms/LoopVectorize/assume.ll
index 10cb67fd4e6a..b1cb79efa224 100644
--- a/llvm/test/Transforms/LoopVectorize/assume.ll
+++ b/llvm/test/Transforms/LoopVectorize/assume.ll
@@ -49,12 +49,8 @@ define void @test2(%struct.data* nocapture readonly %d) {
 ; CHECK:       vector.body:
 ; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
 ; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
 ; CHECK:       for.body:
 entry:
   %b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1

diff --git a/llvm/test/Transforms/LoopVectorize/scalable-assume.ll b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
new file mode 100644
index 000000000000..808cb70b5999
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-assume.ll
@@ -0,0 +1,111 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=2  -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+; CHECK-LABEL: @test1(
+; CHECK:       vector.body:
+; CHECK:         [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT:    [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT:    [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP1L0]])
+; CHECK-NEXT:    [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[FCMP2L0]])
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.assume(i1 %cmp1)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.assume(i1) #0
+
+attributes #0 = { nounwind willreturn }
+
+%struct.data = type { float*, float* }
+
+define void @test2(float *%a, float *%b) {
+; CHECK-LABEL: @test2(
+; CHECK:       entry:
+; CHECK:         [[MASKCOND:%.*]] = icmp eq i64 %ptrint1, 0
+; CHECK:         [[MASKCOND4:%.*]] = icmp eq i64 %ptrint2, 0
+; CHECK:       vector.body:
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK:         tail call void @llvm.assume(i1 [[MASKCOND4]])
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[MASKCOND4]])
+entry:
+  %ptrint1 = ptrtoint float* %a to i64
+  %maskcond = icmp eq i64 %ptrint1, 0
+  %ptrint2 = ptrtoint float* %b to i64
+  %maskcond4 = icmp eq i64 %ptrint2, 0
+  br label %for.body
+
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.assume(i1 %maskcond)
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
+  tail call void @llvm.assume(i1 %maskcond4)
+  %arrayidx5 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; Test case for PR43620. Make sure we can vectorize with predication in presence
+; of assume calls. For now, check that we drop all assumes in predicated blocks
+; in the vector body.
+define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
+; Check that the vector.body does not contain any assumes.
+; CHECK-LABEL: @predicated_assume(
+; CHECK:       vector.body:
+; CHECK-NOT:     llvm.assume
+; CHECK:       for.body:
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %if.end5
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.assume(i1 %cmp2)
+  br label %if.end5
+
+if.end5:                                          ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %0
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !llvm.loop !0
+
+for.cond.cleanup:                                 ; preds = %if.end5, %entry
+  ret void
+}
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff --git a/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
new file mode 100644
index 000000000000..eeb5efe5c8de
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-lifetime.ll
@@ -0,0 +1,81 @@
+; RUN: opt -S -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure we can vectorize loops which contain lifetime markers.
+
+define void @test(i32 *%d) {
+; CHECK-LABEL: @test(
+; CHECK:      entry:
+; CHECK:        [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK-NEXT:   [[BC:%.*]] = bitcast [1024 x i32]* [[ALLOCA]] to i8*
+; CHECK:      vector.body:
+; CHECK:        call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[BC]])
+; CHECK:        store <vscale x 2 x i32>
+; CHECK:        call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[BC]])
+
+entry:
+  %arr = alloca [1024 x i32], align 16
+  %0 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
+  ret void
+}
+
+; CHECK-LABEL: @testloopvariant(
+; CHECK:      entry:
+; CHECK:        [[ALLOCA:%.*]] = alloca [1024 x i32], align 16
+; CHECK:      vector.ph:
+; CHECK:        [[TMP1:%.*]] = insertelement <vscale x 2 x [1024 x i32]*> poison, [1024 x i32]* %arr, i32 0
+; CHECK-NEXT:   [[SPLAT_ALLOCA:%.*]] = shufflevector <vscale x 2 x [1024 x i32]*> [[TMP1]], <vscale x 2 x [1024 x i32]*> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK:      vector.body:
+; CHECK:        [[BC_ALLOCA:%.*]] = bitcast <vscale x 2 x [1024 x i32]*> [[SPLAT_ALLOCA]] to <vscale x 2 x i8*>
+; CHECK-NEXT:   [[ONE_LIFETIME:%.*]] = extractelement <vscale x 2 x i8*> [[BC_ALLOCA]], i32 0
+; CHECK-NEXT:   call void @llvm.lifetime.end.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+; CHECK:        store <vscale x 2 x i32>
+; CHECK-NEXT:   call void @llvm.lifetime.start.p0i8(i64 4096, i8* [[ONE_LIFETIME]])
+
+define void @testloopvariant(i32 *%d) {
+entry:
+  %arr = alloca [1024 x i32], align 16
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
+  %1 = bitcast [1024 x i32]* %arr to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
+  %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx, align 8
+  store i32 100, i32* %arrayidx, align 8
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, 128
+  br i1 %exitcond, label %for.body, label %for.end, !llvm.loop !0
+
+for.end:
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+!0 = distinct !{!0, !1}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

diff --git a/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
new file mode 100644
index 000000000000..813dfbaa40b5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/scalable-noalias-scope-decl.ll
@@ -0,0 +1,127 @@
+; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=4 -force-vector-interleave=2  -S | FileCheck %s
+
+define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 1.000000e+02
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %add = fadd float %0, 1.000000e+00
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+%struct.data = type { float*, float* }
+
+define void @test2(float* %a, float* %b) {
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
+entry:
+  %ptrint = ptrtoint float* %b to i64
+  %maskcond = icmp eq i64 %ptrint, 0
+  %ptrint2 = ptrtoint float* %a to i64
+  %maskcond4 = icmp eq i64 %ptrint2, 0
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, 1.000000e+00
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !4)
+  %arrayidx5 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %add, float* %arrayidx5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv, 1599
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !5
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
+
+; Check that the vector.body still contains a llvm.experimental.noalias.scope.decl
+
+; CHECK-LABEL: @predicated_noalias_scope_decl(
+; CHECK:   vector.body:
+; CHECK:   call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK:   scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK:   if.else:
+; CHECK:   call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %if.end5
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end5 ]
+  %cmp1 = icmp ult i64 %indvars.iv, 495616
+  br i1 %cmp1, label %if.end5, label %if.else
+
+if.else:                                          ; preds = %for.body
+  %cmp2 = icmp ult i64 %indvars.iv, 991232
+  tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  br label %if.end5
+
+if.end5:                                          ; preds = %for.body, %if.else
+  %x.0 = phi float [ 4.200000e+01, %if.else ], [ 2.300000e+01, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %x.0, %0
+  %arrayidx7 = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  store float %mul, float* %arrayidx7, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp = icmp eq i64 %indvars.iv.next, %n
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !llvm.loop !5
+
+for.cond.cleanup:                                 ; preds = %if.end5
+  ret void
+}
+
+!0 = !{ !1 }
+!1 = distinct !{ !1, !2 }
+!2 = distinct !{ !2 }
+!3 = distinct !{ !3, !2 }
+!4 = !{ !3 }
+!5 = distinct !{!5, !6}
+!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}

