[llvm] d10a851 - [WebAssembly] Implement more of getCastInstrCost (#164612)

Mon Nov 10 00:07:21 PST 2025

Author: Sam Parker
Date: 2025-11-10T08:07:16Z
New Revision: d10a85167a26e9489f9daf20acc0092d55687b15

URL: https://github.com/llvm/llvm-project/commit/d10a85167a26e9489f9daf20acc0092d55687b15
DIFF: https://github.com/llvm/llvm-project/commit/d10a85167a26e9489f9daf20acc0092d55687b15.diff

LOG: [WebAssembly] Implement more of getCastInstrCost (#164612)

Fill out more information for sign and zero extend and add some truncate
information; however, the primary change is to int/fp conversions. In
particular, fp to (narrow) int appears to be relatively expensive.

Added: 
    

Modified: 
    llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
    llvm/test/CodeGen/WebAssembly/memory-interleave.ll
    llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 92a9812df2127..70f7b889551a4 100644

--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -119,18 +119,82 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
     }
   }
 
-  // extend_low
   static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
+      // extend_low
       {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
       {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
       {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
       {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
       {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
       {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
+      // 2 x extend_low
       {ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
       {ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
       {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
       {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
+      // extend_low, extend_high
+      {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
+      {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
+      {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
+      {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
+      {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
+      {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
+      // 2x extend_low, extend_high
+      {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4},
+      {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4},
+      {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4},
+      {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4},
+      // shuffle
+      {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2},
+      {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4},
+      {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2},
+      {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4},
+      // narrow, and
+      {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2},
+      {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2},
+      // narrow, 2x and
+      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3},
+      // 3x narrow, 4x and
+      {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7},
+      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7},
+      // 7x narrow, 8x and
+      {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15},
+      // convert_i32x4
+      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
+      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
+      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
+      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
+      // extend_low, convert
+      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
+      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
+      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
+      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
+      // extend_low x 2, convert
+      {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
+      {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
+      {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
+      {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
+      // several shuffles
+      {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
+      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
+      {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
+      {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
+      // trunc_sat, const, and, 3x narrow
+      {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6},
+      {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 6},
+      {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 6},
+      {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 6},
+      // trunc_sat, const, and, narrow
+      {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 4},
+      {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 4},
+      {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4},
+      {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4},
+      // 2x trunc_sat, const, 2x and, 3x narrow
+      {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 8},
+      {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 8},
+      // 2x trunc_sat, const, 2x and, narrow
+      {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 6},
+      {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 6},
   };
 
   if (const auto *Entry =

diff  --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 404db23ba7329..5d58ae223da6f 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1720,28 +1720,7 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: two_floats_two_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const	255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp22.not = icmp eq i32 %N, 0
@@ -1774,28 +1753,7 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: two_floats_two_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.add
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const	255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	0, 16, 1, 17, 2, 18, 3, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: v128.store64_lane
+; CHECK-NOT: v128.load
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp21.not = icmp eq i32 %N, 0
@@ -2347,64 +2305,7 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const	255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp48.not = icmp eq i32 %N, 0
@@ -2453,64 +2354,7 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle  0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const	255, 255, 255, 255
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.narrow_i16x8_u
-; CHECK: i8x16.shuffle	0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 4, 24, 28, 1, 5, 25, 29, 2, 6, 26, 30, 3, 7, 27, 31
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp45.not = icmp eq i32 %N, 0
@@ -2757,62 +2601,7 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_shorts_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const	65535, 65535, 65535, 65535
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
-; CHECK: v128.store
-; CHECK: i8x16.shuffle	0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp48.not = icmp eq i32 %N, 0
@@ -2861,62 +2650,7 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: four_floats_four_shorts_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.const	65535, 65535, 65535, 65535
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: i32x4.trunc_sat_f32x4_s
-; CHECK: v128.and
-; CHECK: i16x8.narrow_i32x4_u
-; CHECK: i8x16.shuffle	0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-; CHECK: i8x16.shuffle	4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31
-; CHECK: v128.store
-; CHECK: i8x16.shuffle	0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27
-; CHECK: v128.store
+; CHECK-NOT: v128.load
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp45.not = icmp eq i32 %N, 0

diff  --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index b26e9cf55ddbf..718e03cfa0c67 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -1231,7 +1231,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48
-; CHECK: LV: Vector loop of width 8 costs: 10.
+; CHECK: LV: Vector loop of width 8 costs: 11.
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %14 = load i8
 ; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %20 = load i8
 ; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %48
@@ -1442,8 +1442,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 23
-; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Vector loop of width 2 costs: 27
+; CHECK: LV: Vector loop of width 4 costs: 15
 ; CHECK: LV: Selecting VF: 4.
 define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1484,8 +1484,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 23
-; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Vector loop of width 2 costs: 27
+; CHECK: LV: Vector loop of width 4 costs: 15
 ; CHECK: LV: Selecting VF: 4.
 define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1526,9 +1526,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 21
-; CHECK: LV: Vector loop of width 4 costs: 14.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 2 costs: 26
+; CHECK: LV: Vector loop of width 4 costs: 16.
+; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp22.not = icmp eq i32 %N, 0
@@ -1566,9 +1566,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 21
-; CHECK: LV: Vector loop of width 4 costs: 14.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 2 costs: 26
+; CHECK: LV: Vector loop of width 4 costs: 16.
+; CHECK: LV: Selecting VF: 1.
 define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp21.not = icmp eq i32 %N, 0
@@ -1608,8 +1608,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 22
-; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Vector loop of width 2 costs: 24
+; CHECK: LV: Vector loop of width 4 costs: 12
 ; CHECK: LV: Selecting VF: 4.
 define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1652,8 +1652,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 22
-; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Vector loop of width 2 costs: 24
+; CHECK: LV: Vector loop of width 4 costs: 12
 ; CHECK: LV: Selecting VF: 4.
 define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1696,9 +1696,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 20
-; CHECK: LV: Vector loop of width 4 costs: 13.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 14
+; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp22.not = icmp eq i32 %N, 0
@@ -1738,9 +1738,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
 ; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 20
-; CHECK: LV: Vector loop of width 4 costs: 13.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 2 costs: 23
+; CHECK: LV: Vector loop of width 4 costs: 14
+; CHECK: LV: Selecting VF: 4
 define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp21.not = icmp eq i32 %N, 0
@@ -1883,8 +1883,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 43
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 51
+; CHECK: LV: Vector loop of width 4 costs: 27
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -1943,8 +1943,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 43
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 51
+; CHECK: LV: Vector loop of width 4 costs: 27
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2004,9 +2004,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
-; CHECK: LV: Vector loop of width 2 costs: 38
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 2 costs: 48
+; CHECK: LV: Vector loop of width 4 costs: 31
+; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp48.not = icmp eq i32 %N, 0
@@ -2061,9 +2061,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
-; CHECK: LV: Vector loop of width 2 costs: 38
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 2 costs: 48
+; CHECK: LV: Vector loop of width 4 costs: 31
+; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp45.not = icmp eq i32 %N, 0
@@ -2119,8 +2119,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 37
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 41
+; CHECK: LV: Vector loop of width 4 costs: 25
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2181,8 +2181,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 37
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 41
+; CHECK: LV: Vector loop of width 4 costs: 25
 ; CHECK: LV: Selecting VF: 4
 define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
@@ -2243,9 +2243,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
-; CHECK: LV: Vector loop of width 2 costs: 35
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 2 costs: 41
+; CHECK: LV: Vector loop of width 4 costs: 29
+; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp48.not = icmp eq i32 %N, 0
@@ -2301,9 +2301,9 @@ for.body:                                         ; preds = %entry, %for.body
 ; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
 ; CHECK: LV: Scalar loop costs: 28
-; CHECK: LV: Vector loop of width 2 costs: 35
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 2 costs: 41
+; CHECK: LV: Vector loop of width 4 costs: 29
+; CHECK: LV: Selecting VF: 1
 define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
 entry:
   %cmp45.not = icmp eq i32 %N, 0