[llvm] e5b6833 - [WebAssembly] vi8 mul cost modelling. (#175177)

Mon Jan 12 01:25:59 PST 2026

Author: Sam Parker
Date: 2026-01-12T09:25:54Z
New Revision: e5b6833e49d2752466612aa0ac71f185b1d4afc7

URL: https://github.com/llvm/llvm-project/commit/e5b6833e49d2752466612aa0ac71f185b1d4afc7
DIFF: https://github.com/llvm/llvm-project/commit/e5b6833e49d2752466612aa0ac71f185b1d4afc7.diff

LOG: [WebAssembly] vi8 mul cost modelling. (#175177)

We've already optimised these, so update the cost model to reflect it.
Also skip the isBeforeLegalize check when lowering i8 muls, because that
check makes the combine miss the cases where, say, a v32i8 mul has been
type-legalised into 2x v16i8.

Also explicitly disable memory interleaving for any factor other than
two or four.

Added: 
    

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
    llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
    llvm/test/CodeGen/WebAssembly/memory-interleave.ll
    llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
    llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 25571de9384d5..a2557f45395c2 100644

--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3558,9 +3558,8 @@ static SDValue performMulCombine(SDNode *N,
     return Res;
 
   // We don't natively support v16i8 or v8i8 mul, but we do support v8i16. So,
-  // extend them to v8i16. Only do this before legalization in case a narrow
-  // vector is widened and may be simplified later.
-  if (!DCI.isBeforeLegalize() || (VT != MVT::v8i8 && VT != MVT::v16i8))
+  // extend them to v8i16.
+  if (VT != MVT::v8i8 && VT != MVT::v16i8)
     return SDValue();
 
   SDLoc DL(N);

diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index e3d01075ed5e9..434827c689a8c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -58,6 +58,26 @@ InstructionCost WebAssemblyTTIImpl::getArithmeticInstrCost(
     TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
     ArrayRef<const Value *> Args, const Instruction *CxtI) const {
 
+  if (ST->hasSIMD128()) {
+    static const CostTblEntry ArithCostTbl[]{
+        // extmul + (maybe awkward) shuffle
+        {ISD::MUL, MVT::v8i8, 4},
+        // 2x extmul + (okay) shuffle
+        {ISD::MUL, MVT::v16i8, 4},
+        // extmul
+        {ISD::MUL, MVT::v4i16, 1},
+        // extmul
+        {ISD::MUL, MVT::v2i32, 1},
+    };
+    EVT DstVT = TLI->getValueType(DL, Ty);
+    if (DstVT.isSimple()) {
+      int ISD = TLI->InstructionOpcodeToISD(Opcode);
+      if (const auto *Entry =
+              CostTableLookup(ArithCostTbl, ISD, DstVT.getSimpleVT()))
+        return Entry->Cost;
+    }
+  }
+
   InstructionCost Cost =
       BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
           Opcode, Ty, CostKind, Op1Info, Op2Info);
@@ -302,6 +322,9 @@ InstructionCost WebAssemblyTTIImpl::getInterleavedMemoryOpCost(
     if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
       return InstructionCost::getInvalid();
 
+    if (Factor != 2 && Factor != 4)
+      return InstructionCost::getInvalid();
+
     auto *SubVecTy =
         VectorType::get(VecTy->getElementType(),
                         VecTy->getElementCount().divideCoefficientBy(Factor));

diff  --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 5d58ae223da6f..0cde16800ceb5 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -2697,3 +2697,130 @@ for.body:                                         ; preds = %entry, %for.body
   %exitcond.not = icmp eq i32 %inc, %N
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+; CHECK-NOT: v128.load
+define hidden void @mac_3d_i8(ptr dead_on_unwind noalias writable writeonly sret(%struct.ThreeBytes) align 1 captures(none) %0, ptr noundef readonly captures(none) %1, ptr noundef readonly captures(none) %2, i32 noundef %3) {
+  %5 = icmp eq i32 %3, 0
+  br i1 %5, label %6, label %12
+
+6:                                                ; preds = %12, %4
+  %7 = phi i8 [ 0, %4 ], [ %34, %12 ]
+  %8 = phi i8 [ 0, %4 ], [ %28, %12 ]
+  %9 = phi i8 [ 0, %4 ], [ %22, %12 ]
+  %10 = getelementptr inbounds nuw i8, ptr %0, i32 2
+  %11 = getelementptr inbounds nuw i8, ptr %0, i32 1
+  store i8 %9, ptr %0, align 1
+  store i8 %8, ptr %11, align 1
+  store i8 %7, ptr %10, align 1
+  ret void
+
+12:                                               ; preds = %4, %12
+  %13 = phi i32 [ %35, %12 ], [ 0, %4 ]
+  %14 = phi i8 [ %22, %12 ], [ 0, %4 ]
+  %15 = phi i8 [ %28, %12 ], [ 0, %4 ]
+  %16 = phi i8 [ %34, %12 ], [ 0, %4 ]
+  %17 = getelementptr inbounds nuw %struct.ThreeBytes, ptr %1, i32 %13
+  %18 = load i8, ptr %17, align 1
+  %19 = getelementptr inbounds nuw %struct.ThreeBytes, ptr %2, i32 %13
+  %20 = load i8, ptr %19, align 1
+  %21 = mul i8 %20, %18
+  %22 = add i8 %21, %14
+  %23 = getelementptr inbounds nuw i8, ptr %17, i32 1
+  %24 = load i8, ptr %23, align 1
+  %25 = getelementptr inbounds nuw i8, ptr %19, i32 1
+  %26 = load i8, ptr %25, align 1
+  %27 = mul i8 %26, %24
+  %28 = add i8 %27, %15
+  %29 = getelementptr inbounds nuw i8, ptr %17, i32 2
+  %30 = load i8, ptr %29, align 1
+  %31 = getelementptr inbounds nuw i8, ptr %19, i32 2
+  %32 = load i8, ptr %31, align 1
+  %33 = mul i8 %32, %30
+  %34 = add i8 %33, %16
+  %35 = add nuw i32 %13, 1
+  %36 = icmp eq i32 %35, %3
+  br i1 %36, label %6, label %12
+}
+
+; CHECK-LABEL: mac_4d_i8
+; CHECK: loop
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: v128.load
+; CHECK: v128.load
+; CHECK: i8x16.shuffle	3, 7, 11, 15, 19, 23, 27, 31, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extmul_low_i8x16_u
+; CHECK: i8x16.shuffle	0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.add
+; CHECK: i8x16.shuffle	2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle	2, 6, 10, 14, 18, 22, 26, 30, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extmul_low_i8x16_u
+; CHECK: i8x16.shuffle	0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.add
+; CHECK: i8x16.shuffle	1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle	1, 5, 9, 13, 17, 21, 25, 29, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extmul_low_i8x16_u
+; CHECK: i8x16.shuffle	0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.add
+; CHECK: i8x16.shuffle	0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.shuffle	0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i16x8.extmul_low_i8x16_u
+; CHECK: i8x16.shuffle	0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK: i8x16.add
+define hidden void @mac_4d_i8(ptr dead_on_unwind noalias writable writeonly sret(%struct.FourBytes) align 1 captures(none) initializes((0, 4)) %0, ptr noundef readonly captures(none) %1, ptr noundef readonly captures(none) %2, i32 noundef %3) {
+  store i32 0, ptr %0, align 1
+  %5 = icmp eq i32 %3, 0
+  br i1 %5, label %11, label %6
+
+6:                                                ; preds = %4
+  %7 = getelementptr inbounds nuw i8, ptr %0, i32 1
+  %8 = getelementptr inbounds nuw i8, ptr %0, i32 2
+  %9 = getelementptr inbounds nuw i8, ptr %0, i32 3
+  br label %13
+
+10:                                               ; preds = %13
+  store i8 %30, ptr %7, align 1
+  store i8 %36, ptr %8, align 1
+  store i8 %42, ptr %9, align 1
+  br label %11
+
+11:                                               ; preds = %10, %4
+  %12 = phi i8 [ %24, %10 ], [ 0, %4 ]
+  store i8 %12, ptr %0, align 1
+  ret void
+
+13:                                               ; preds = %6, %13
+  %14 = phi i8 [ 0, %6 ], [ %42, %13 ]
+  %15 = phi i8 [ 0, %6 ], [ %36, %13 ]
+  %16 = phi i8 [ 0, %6 ], [ %30, %13 ]
+  %17 = phi i32 [ 0, %6 ], [ %43, %13 ]
+  %18 = phi i8 [ 0, %6 ], [ %24, %13 ]
+  %19 = getelementptr inbounds nuw %struct.FourBytes, ptr %1, i32 %17
+  %20 = load i8, ptr %19, align 1
+  %21 = getelementptr inbounds nuw %struct.FourBytes, ptr %2, i32 %17
+  %22 = load i8, ptr %21, align 1
+  %23 = mul i8 %22, %20
+  %24 = add i8 %23, %18
+  %25 = getelementptr inbounds nuw i8, ptr %19, i32 1
+  %26 = load i8, ptr %25, align 1
+  %27 = getelementptr inbounds nuw i8, ptr %21, i32 1
+  %28 = load i8, ptr %27, align 1
+  %29 = mul i8 %28, %26
+  %30 = add i8 %29, %16
+  %31 = getelementptr inbounds nuw i8, ptr %19, i32 2
+  %32 = load i8, ptr %31, align 1
+  %33 = getelementptr inbounds nuw i8, ptr %21, i32 2
+  %34 = load i8, ptr %33, align 1
+  %35 = mul i8 %34, %32
+  %36 = add i8 %35, %15
+  %37 = getelementptr inbounds nuw i8, ptr %19, i32 3
+  %38 = load i8, ptr %37, align 1
+  %39 = getelementptr inbounds nuw i8, ptr %21, i32 3
+  %40 = load i8, ptr %39, align 1
+  %41 = mul i8 %40, %38
+  %42 = add i8 %41, %14
+  %43 = add nuw i32 %17, 1
+  %44 = icmp eq i32 %43, %3
+  br i1 %44, label %10, label %13
+}

diff  --git a/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll b/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
index 99e6e5cc3fd4c..d69117187a261 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-pr61780.ll
@@ -7,7 +7,7 @@ define void @f(ptr %0, ptr %pr) {
 ; CHECK-NEXT:    .local v128
 ; CHECK-NEXT:  # %bb.0: # %BB
 ; CHECK-NEXT:    local.get 1
-; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    v128.const 0, 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
 ; CHECK-NEXT:    i32.const 16
 ; CHECK-NEXT:    local.get 0
 ; CHECK-NEXT:    v128.load64_zero 0
@@ -18,9 +18,6 @@ define void @f(ptr %0, ptr %pr) {
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.shr_u
-; CHECK-NEXT:    local.tee 0
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i32.mul
 ; CHECK-NEXT:    i8x16.replace_lane 0
 ; CHECK-NEXT:    i32.const 16
 ; CHECK-NEXT:    local.get 2
@@ -28,10 +25,12 @@ define void @f(ptr %0, ptr %pr) {
 ; CHECK-NEXT:    i32.const 1
 ; CHECK-NEXT:    i32.and
 ; CHECK-NEXT:    i32.shr_u
-; CHECK-NEXT:    local.tee 0
-; CHECK-NEXT:    local.get 0
-; CHECK-NEXT:    i32.mul
 ; CHECK-NEXT:    i8x16.replace_lane 1
+; CHECK-NEXT:    local.tee 2
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i16x8.extmul_low_i8x16_u
+; CHECK-NEXT:    local.get 2
+; CHECK-NEXT:    i8x16.shuffle 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
 ; CHECK-NEXT:    v128.store16_lane 0, 0
 ; CHECK-NEXT:    # fallthrough-return
 BB:

diff  --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index 718e03cfa0c67..e60268fe6a087 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -89,17 +89,15 @@ define hidden void @two_ints_vary_op(ptr noalias nocapture noundef writeonly %0,
 }
 
 ; CHECK-LABEL: three_ints
-; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 3 at
-; CHECK: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 3 at
 ; CHECK: LV: Scalar loop costs: 16.
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %10 = load i32, ptr %9
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %12 = load i32, ptr %11
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 2 costs: 24.
-; CHECK: LV: Found an estimated cost of 28 for VF 4 For instruction: %10 = load i32, ptr %9
-; CHECK: LV: Found an estimated cost of 28 for VF 4 For instruction: %12 = load i32, ptr %11
-; CHECK: LV: Found an estimated cost of 28 for VF 4 For instruction: store i32 %25, ptr %26
-; CHECK: LV: Vector loop of width 4 costs: 22.
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i32, ptr %9
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i32, ptr %11
+; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i32 %25, ptr %26
+; CHECK: LV: Vector loop of width 2 costs: 30.
+; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i32, ptr %9
+; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i32, ptr %11
+; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i32 %25, ptr %26
+; CHECK: LV: Vector loop of width 4 costs: 28.
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -137,21 +135,19 @@ define hidden void @three_ints(ptr noalias nocapture noundef writeonly %0, ptr n
 }
 
 ; CHECK-LABEL: three_shorts
-; CHECK: Cost of 26 for VF 4: INTERLEAVE-GROUP with factor 3
-; CHECK: Cost of 52 for VF 8: INTERLEAVE-GROUP with factor 3
 ; CHECK: LV: Scalar loop costs: 16.
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %12 = load i16
 ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: store i16 %25
 ; CHECK: LV: Vector loop of width 2 costs: 30.
-; CHECK: LV: Found an estimated cost of 26 for VF 4 For instruction: %10 = load i16
-; CHECK: LV: Found an estimated cost of 26 for VF 4 For instruction: %12 = load i16
-; CHECK: LV: Found an estimated cost of 26 for VF 4 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 4 costs: 21.
-; CHECK: LV: Found an estimated cost of 52 for VF 8 For instruction: %10 = load i16
-; CHECK: LV: Found an estimated cost of 52 for VF 8 For instruction: %12 = load i16
-; CHECK: LV: Found an estimated cost of 52 for VF 8 For instruction: store i16 %25
-; CHECK: LV: Vector loop of width 8 costs: 20.
+; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %10 = load i16
+; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: %12 = load i16
+; CHECK: LV: Found an estimated cost of 12 for VF 4 For instruction: store i16 %25
+; CHECK: LV: Vector loop of width 4 costs: 28.
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i16
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i16
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i16 %25
+; CHECK: LV: Vector loop of width 8 costs: 27.
 ; CHECK: LV: Selecting VF: 1
 define hidden void @three_shorts(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -444,13 +440,13 @@ define hidden void @five_shorts(ptr noalias nocapture noundef writeonly %0, ptr
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
 ; CHECK: LV: Vector loop of width 4 costs: 15.
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
-; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %13 = mul i8
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 9.
+; CHECK: LV: Vector loop of width 8 costs: 4.
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
-; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %13 = mul i8
+; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 7.
+; CHECK: LV: Vector loop of width 16 costs: 1.
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -494,13 +490,13 @@ define hidden void @two_bytes_same_op(ptr noalias nocapture noundef writeonly %0
 ; CHECK: LV: Found an estimated cost of 11 for VF 4 For instruction: store i8
 ; CHECK: LV: Vector loop of width 4 costs: 12.
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: %12 = load i8
-; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %13 = mul i8
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 6.
+; CHECK: LV: Vector loop of width 8 costs: 3.
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: %12 = load i8
-; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %13 = mul i8
+; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %19
-; CHECK: LV: Vector loop of width 16 costs: 4.
+; CHECK: LV: Vector loop of width 16 costs: 1.
 ; CHECK: LV: Selecting VF: 16.
 define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -531,19 +527,17 @@ define hidden void @two_bytes_vary_op(ptr noalias nocapture noundef writeonly %0
 }
 
 ; CHECK-LABEL: three_bytes_same_op
-; CHECK: Cost of 50 for VF 8: INTERLEAVE-GROUP with factor 3 at %10
-; CHECK: Cost of 100 for VF 16: INTERLEAVE-GROUP with factor 3 at %10
 ; CHECK: LV: Scalar loop costs: 16.
 ; CHECK: LV: Vector loop of width 2 costs: 30.
 ; CHECK: LV: Vector loop of width 4 costs: 28.
-; CHECK: LV: Found an estimated cost of 50 for VF 8 For instruction: %10 = load i8, ptr %9
-; CHECK: LV: Found an estimated cost of 50 for VF 8 For instruction: %12 = load i8, ptr %11
-; CHECK: LV: Found an estimated cost of 50 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 19.
-; CHECK: LV: Found an estimated cost of 100 for VF 16 For instruction: %10 = load i8, ptr %9
-; CHECK: LV: Found an estimated cost of 100 for VF 16 For instruction: %12 = load i8, ptr %11
-; CHECK: LV: Found an estimated cost of 100 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 19.
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
+; CHECK: LV: Vector loop of width 8 costs: 27.
+; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
+; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
+; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
+; CHECK: LV: Vector loop of width 16 costs: 27.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -581,19 +575,17 @@ define hidden void @three_bytes_same_op(ptr noalias nocapture noundef writeonly
 }
 
 ; CHECK-LABEL: three_bytes_interleave_op
-; CHECK: Cost of 50 for VF 8: INTERLEAVE-GROUP with factor 3 at %10, ir<%9>
-; CHECK: Cost of 100 for VF 16: INTERLEAVE-GROUP with factor 3 at %10, ir<%9>
 ; CHECK: LV: Scalar loop costs: 16.
 ; CHECK: LV: Vector loop of width 2 costs: 30.
 ; CHECK: LV: Vector loop of width 4 costs: 28.
-; CHECK: LV: Found an estimated cost of 50 for VF 8 For instruction: %10 = load i8, ptr %9
-; CHECK: LV: Found an estimated cost of 50 for VF 8 For instruction: %12 = load i8, ptr %11
-; CHECK: LV: Found an estimated cost of 50 for VF 8 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 8 costs: 19.
-; CHECK: LV: Found an estimated cost of 100 for VF 16 For instruction: %10 = load i8, ptr %9
-; CHECK: LV: Found an estimated cost of 100 for VF 16 For instruction: %12 = load i8, ptr %11
-; CHECK: LV: Found an estimated cost of 100 for VF 16 For instruction: store i8 %25
-; CHECK: LV: Vector loop of width 16 costs: 19.
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %10 = load i8, ptr %9
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %12 = load i8, ptr %11
+; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: store i8 %25
+; CHECK: LV: Vector loop of width 8 costs: 27.
+; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %10 = load i8, ptr %9
+; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %12 = load i8, ptr %11
+; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: store i8 %25
+; CHECK: LV: Vector loop of width 16 costs: 27.
 ; CHECK: LV: Selecting VF: 1.
 define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -704,14 +696,14 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly %
 ; CHECK: LV: Vector loop of width 4 costs: 21.
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %12 = load i8
-; CHECK: LV: Found an estimated cost of 24 for VF 8 For instruction: %13 = mul i8
+; CHECK: LV: Found an estimated cost of 4 for VF 8 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: store i8
-; CHECK: LV: Vector loop of width 8 costs: 16.
+; CHECK: LV: Vector loop of width 8 costs: 11.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %12 = load i8
-; CHECK: LV: Found an estimated cost of 48 for VF 16 For instruction: %13 = mul i8
+; CHECK: LV: Found an estimated cost of 4 for VF 16 For instruction: %13 = mul i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: store i8
-; CHECK: LV: Vector loop of width 16 costs: 31
+; CHECK: LV: Vector loop of width 16 costs: 25
 ; CHECK: LV: Selecting VF: 8
 define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0
@@ -829,11 +821,11 @@ define hidden void @four_bytes_interleave_op(ptr noalias nocapture noundef write
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 8 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 8 costs: 74
+; CHECK: LV: Vector loop of width 8 costs: 54
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %10 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: %12 = load i8
 ; CHECK: LV: Found an estimated cost of 264 for VF 16 For instruction: store i8 %55
-; CHECK: LV: Vector loop of width 16 costs: 73
+; CHECK: LV: Vector loop of width 16 costs: 51
 ; CHECK: LV: Selecting VF: 1
 define hidden void @eight_bytes_same_op(ptr noalias nocapture noundef writeonly %0, ptr nocapture noundef readonly %1, ptr nocapture noundef readonly %2, i32 noundef %3) {
   %5 = icmp eq i32 %3, 0