[llvm] 971a450 - [RISCV] Model vlseg/vsseg in interleaved memory ops

Tue Apr 4 07:05:20 PDT 2023

Author: Luke Lau
Date: 2023-04-04T15:05:14+01:00
New Revision: 971a4501f7f2017816e3365f0008418fd978a9e7

URL: https://github.com/llvm/llvm-project/commit/971a4501f7f2017816e3365f0008418fd978a9e7
DIFF: https://github.com/llvm/llvm-project/commit/971a4501f7f2017816e3365f0008418fd978a9e7.diff

LOG: [RISCV] Model vlseg/vsseg in interleaved memory ops

If the legalized type is a legal interleaved access type (i.e. there's a
supported vlseg/vsseg instruction for it), the interleaved access pass
will pick any interleaved memory op (wide load + shuffles) and lower it
into a vlseg/vsseg intrinsic.

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D146522

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
    llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
    llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 72f2d67369d89..e90d5d8f873ee 100644

--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -436,6 +436,25 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
       getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
   unsigned VF = FVTy->getNumElements() / Factor;
 
+  // Then interleaved memory access pass with lower interleaved memory ops (i.e
+  // a load and store followed by a specific shuffle) to vlseg/vsseg
+  // intrinsics. In those cases then we can treat it as if it's just one (legal)
+  // memory op
+  if (!UseMaskForCond && !UseMaskForGaps &&
+      Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(FVTy);
+    // Need to make sure type has't been scalarized
+    if (LT.second.isFixedLengthVector()) {
+      auto *LegalFVTy = FixedVectorType::get(FVTy->getElementType(),
+                                             LT.second.getVectorNumElements());
+      if (TLI->isLegalInterleavedAccessType(LegalFVTy, Factor, DL)) {
+        InstructionCost LegalMemCost = getMemoryOpCost(
+            Opcode, LegalFVTy, Alignment, AddressSpace, CostKind);
+        return LT.first + LegalMemCost;
+      }
+    }
+  }
+
   // An interleaved load will look like this for Factor=3:
   // %wide.vec = load <12 x i32>, ptr %3, align 4
   // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
index 8d51c4f284dc8..9fb2510344402 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll
@@ -152,54 +152,39 @@ exit:
 define void @load_store_factor3_i32(ptr %p) {
 ; CHECK-LABEL: @load_store_factor3_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i64 0
-; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[P:%.*]], <vscale x 4 x i64> [[TMP10]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP12]], <vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP13:%.*]] = add <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP13]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP15:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP15]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP16:%.*]] = add <vscale x 4 x i64> [[TMP13]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[P]], <vscale x 4 x i64> [[TMP16]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP18:%.*]] = add <vscale x 4 x i32> [[WIDE_MASKED_GATHER2]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[TMP18]], <vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = mul i64 [[TMP19]], 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <6 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 1, i32 4>
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <6 x i32> [[WIDE_VEC]], <6 x i32> poison, <2 x i32> <i32 2, i32 5>
+; CHECK-NEXT:    [[TMP4:%.*]] = add <2 x i32> [[STRIDED_VEC]], <i32 1, i32 1>
+; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = add <2 x i32> [[STRIDED_VEC1]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[STRIDED_VEC2]], <i32 3, i32 3>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 -2
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <6 x i32> [[TMP13]], <6 x i32> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
+; CHECK-NEXT:    store <6 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
@@ -513,19 +498,30 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i64, ptr [[TMP2]], i32 0
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
+; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i64, ptr [[TMP5]], i32 0
-; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[TMP6]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i64, ptr [[Q]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
+; CHECK-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
+; CHECK-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]

diff  --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll
index 327d622ce4301..3de06ae1dbc0d 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll
@@ -9,25 +9,25 @@ define void @i8_factor_2(ptr %data, i64 %n) {
 entry:
   br label %for.body
 ; VF_2-LABEL: Checking a loop in 'i8_factor_2'
-; VF_2:       Found an estimated cost of 3 for VF 2 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_2:       Found an estimated cost of 2 for VF 2 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_2-NEXT:  Found an estimated cost of 0 for VF 2 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_2:       Found an estimated cost of 0 for VF 2 For instruction:   store i8 %a0, ptr %p0, align 1
-; VF_2-NEXT:  Found an estimated cost of 3 for VF 2 For instruction:   store i8 %a1, ptr %p1, align 1
+; VF_2-NEXT:  Found an estimated cost of 2 for VF 2 For instruction:   store i8 %a1, ptr %p1, align 1
 ; VF_4-LABEL: Checking a loop in 'i8_factor_2'
-; VF_4:       Found an estimated cost of 3 for VF 4 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_4:       Found an estimated cost of 2 for VF 4 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_4:       Found an estimated cost of 0 for VF 4 For instruction:   store i8 %a0, ptr %p0, align 1
-; VF_4-NEXT:  Found an estimated cost of 3 for VF 4 For instruction:   store i8 %a1, ptr %p1, align 1
+; VF_4-NEXT:  Found an estimated cost of 2 for VF 4 For instruction:   store i8 %a1, ptr %p1, align 1
 ; VF_8-LABEL: Checking a loop in 'i8_factor_2'
-; VF_8:       Found an estimated cost of 3 for VF 8 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_8:       Found an estimated cost of 2 for VF 8 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_8:       Found an estimated cost of 0 for VF 8 For instruction:   store i8 %a0, ptr %p0, align 1
-; VF_8-NEXT:  Found an estimated cost of 3 for VF 8 For instruction:   store i8 %a1, ptr %p1, align 1
+; VF_8-NEXT:  Found an estimated cost of 2 for VF 8 For instruction:   store i8 %a1, ptr %p1, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_2'
-; VF_16:       Found an estimated cost of 3 for VF 16 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_16:       Found an estimated cost of 2 for VF 16 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_16-NEXT:  Found an estimated cost of 0 for VF 16 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_16:       Found an estimated cost of 0 for VF 16 For instruction:   store i8 %a0, ptr %p0, align 1
-; VF_16-NEXT:  Found an estimated cost of 5 for VF 16 For instruction:   store i8 %a1, ptr %p1, align 1
+; VF_16-NEXT:  Found an estimated cost of 2 for VF 16 For instruction:   store i8 %a1, ptr %p1, align 1
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
   %p0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0
@@ -51,26 +51,26 @@ define void @i8_factor_3(ptr %data, i64 %n) {
 entry:
   br label %for.body
 ; VF_2-LABEL: Checking a loop in 'i8_factor_3'
-; VF_2:       Found an estimated cost of 6 for VF 2 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_2:       Found an estimated cost of 2 for VF 2 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_2-NEXT:  Found an estimated cost of 0 for VF 2 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_2-NEXT:  Found an estimated cost of 0 for VF 2 For instruction:   %l2 = load i8, ptr %p2, align 1
 ; VF_2:       Found an estimated cost of 0 for VF 2 For instruction:   store i8 %a0, ptr %p0, align 1
 ; VF_2:       Found an estimated cost of 0 for VF 2 For instruction:   store i8 %a1, ptr %p1, align 1
-; VF_2-NEXT:  Found an estimated cost of 6 for VF 2 For instruction:   store i8 %a2, ptr %p2, align 1
+; VF_2-NEXT:  Found an estimated cost of 2 for VF 2 For instruction:   store i8 %a2, ptr %p2, align 1
 ; VF_4-LABEL: Checking a loop in 'i8_factor_3'
-; VF_4:       Found an estimated cost of 12 for VF 4 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_4:       Found an estimated cost of 2 for VF 4 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_4-NEXT:  Found an estimated cost of 0 for VF 4 For instruction:   %l2 = load i8, ptr %p2, align 1
 ; VF_4:       Found an estimated cost of 0 for VF 4 For instruction:   store i8 %a0, ptr %p0, align 1
 ; VF_4:       Found an estimated cost of 0 for VF 4 For instruction:   store i8 %a1, ptr %p1, align 1
-; VF_4-NEXT:  Found an estimated cost of 12 for VF 4 For instruction:   store i8 %a2, ptr %p2, align 1
+; VF_4-NEXT:  Found an estimated cost of 2 for VF 4 For instruction:   store i8 %a2, ptr %p2, align 1
 ; VF_8-LABEL: Checking a loop in 'i8_factor_3'
-; VF_8:       Found an estimated cost of 24 for VF 8 For instruction:   %l0 = load i8, ptr %p0, align 1
+; VF_8:       Found an estimated cost of 2 for VF 8 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction:   %l1 = load i8, ptr %p1, align 1
 ; VF_8-NEXT:  Found an estimated cost of 0 for VF 8 For instruction:   %l2 = load i8, ptr %p2, align 1
 ; VF_8:       Found an estimated cost of 0 for VF 8 For instruction:   store i8 %a0, ptr %p0, align 1
 ; VF_8:       Found an estimated cost of 0 for VF 8 For instruction:   store i8 %a1, ptr %p1, align 1
-; VF_8-NEXT:  Found an estimated cost of 24 for VF 8 For instruction:   store i8 %a2, ptr %p2, align 1
+; VF_8-NEXT:  Found an estimated cost of 2 for VF 8 For instruction:   store i8 %a2, ptr %p2, align 1
 ; VF_16-LABEL: Checking a loop in 'i8_factor_3'
 ; VF_16:       Found an estimated cost of 48 for VF 16 For instruction:   %l0 = load i8, ptr %p0, align 1
 ; VF_16-NEXT:  Found an estimated cost of 0 for VF 16 For instruction:   %l1 = load i8, ptr %p1, align 1