[llvm] [WebAssembly] Implement more of getCastInstrCost (PR #164612)
Sam Parker via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 22 05:28:08 PDT 2025
https://github.com/sparker-arm created https://github.com/llvm/llvm-project/pull/164612
Fill out more information for sign and zero extend and add some truncate information; however, the primary change is to int/fp conversions. In particular, fp to (narrow) int appears to be rather expensive as we effectively scalarize and rebuild the vector result.
>From 4a2f595748247e2be6cc62afee12c27db2629fe5 Mon Sep 17 00:00:00 2001
From: Sam Parker <sam.parker at arm.com>
Date: Wed, 22 Oct 2025 13:21:36 +0100
Subject: [PATCH] [WebAssembly] Implement more of getCastInstrCost
Fill out more information for sign and zero extend and add some
truncate information; however, the primary change is to int/fp
conversions. In particular, fp to (narrow) int appears to be rather
expensive as we effectively scalarize and rebuild the vector result.
---
.../WebAssemblyTargetTransformInfo.cpp | 63 ++-
.../CodeGen/WebAssembly/memory-interleave.ll | 498 +-----------------
.../WebAssembly/memory-interleave.ll | 74 +--
3 files changed, 107 insertions(+), 528 deletions(-)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 92a9812df2127..69fd7fe38988a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -119,18 +119,79 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
}
}
- // extend_low
static constexpr TypeConversionCostTblEntry ConversionTbl[] = {
+ // extend_low
{ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1},
{ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1},
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1},
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1},
{ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1},
{ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1},
+ // 2 x extend_low
{ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2},
{ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2},
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2},
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2},
+ // extend_low, extend_high
+ {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
+ {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
+ {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
+ {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
+ {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
+ {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
+ // 2x extend_low, extend_high
+ {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 4},
+ {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 4},
+ {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4},
+ {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4},
+ // shuffle
+ {ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 2},
+ {ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 4},
+ {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 2},
+ {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 4},
+ // narrow, and
+ {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2},
+ {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2},
+ // narrow, 2x and
+ {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3},
+ // 3x narrow, 4x and
+ {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 7},
+ {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 7},
+ // 7x narrow, 8x and
+ {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 15},
+ // convert_i32x4
+ {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
+ {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
+ {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
+ {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
+ // extend_low, convert
+ {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
+ {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2},
+ {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
+ {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
+ // extend_low x 2, convert
+ {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
+ {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
+ {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
+ {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
+ // several shuffles
+ {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
+ {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
+ {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 10},
+ {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
+ // extract_lane, trunc_sat, replace_lane.
+ {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 6},
+ {ISD::FP_TO_UINT, MVT::v4i8, MVT::v2f32, 6},
+ {ISD::FP_TO_SINT, MVT::v4i16, MVT::v2f32, 6},
+ {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 6},
+ {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 12},
+ {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 12},
+ {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 12},
+ {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 12},
+ {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 24},
+ {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 24},
+ {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 24},
+ {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 24},
};
if (const auto *Entry =
diff --git a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
index 104ec31e13d3e..caa20ab140c26 100644
--- a/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
+++ b/llvm/test/CodeGen/WebAssembly/memory-interleave.ll
@@ -1720,42 +1720,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: two_floats_two_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store64_lane
+; CHECK-NOT: f32x4
define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp22.not = icmp eq i32 %N, 0
@@ -1788,42 +1753,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: two_floats_two_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store64_lane
+; CHECK-NOT: f32x4
define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp21.not = icmp eq i32 %N, 0
@@ -1966,42 +1896,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: two_floats_two_shorts_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: f32x4
define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp22.not = icmp eq i32 %N, 0
@@ -2034,42 +1929,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: two_floats_two_shorts_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: f32x4
define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp21.not = icmp eq i32 %N, 0
@@ -2410,92 +2270,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_bytes_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: f32x4
define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp48.not = icmp eq i32 %N, 0
@@ -2544,92 +2319,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_bytes_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i8x16.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: f32x4
define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
@@ -2876,93 +2566,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_shorts_same_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: f32x4
define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp48.not = icmp eq i32 %N, 0
@@ -3011,93 +2615,7 @@ for.body: ; preds = %entry, %for.body
}
; CHECK-LABEL: four_floats_four_shorts_vary_op:
-; CHECK: loop
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 16, 17, 18, 19, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: v128.load
-; CHECK: v128.load
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 16, 17, 18, 19
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.mul
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 4, 5, 6, 7, 20, 21, 22, 23, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 20, 21, 22, 23
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.add
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 8, 9, 10, 11, 24, 25, 26, 27, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 24, 25, 26, 27
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.div
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 12, 13, 14, 15, 28, 29, 30, 31, 0, 1, 2, 3, 0, 1, 2, 3
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 0, 1, 2, 3, 12, 13, 14, 15, 28, 29, 30, 31
-; CHECK: i8x16.shuffle {{.*}} 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-; CHECK: f32x4.sub
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: v128.store
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.splat
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: f32x4.extract_lane
-; CHECK: i32.trunc_sat_f32_s
-; CHECK: i16x8.replace_lane
-; CHECK: v128.store
+; CHECK-NOT: f32x4
define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
index e42e2c73d588a..e8776ddc0811a 100644
--- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll
@@ -1231,7 +1231,7 @@ define hidden void @scale_uv_row_down2(ptr nocapture noundef readonly %0, i32 no
; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %14 = load i8
; CHECK: LV: Found an estimated cost of 26 for VF 8 For instruction: %20 = load i8
; CHECK: LV: Found an estimated cost of 7 for VF 8 For instruction: store i8 %48
-; CHECK: LV: Vector loop of width 8 costs: 10.
+; CHECK: LV: Vector loop of width 8 costs: 11.
; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %14 = load i8
; CHECK: LV: Found an estimated cost of 132 for VF 16 For instruction: %20 = load i8
; CHECK: LV: Found an estimated cost of 6 for VF 16 For instruction: store i8 %48
@@ -1442,8 +1442,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 23
-; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Vector loop of width 2 costs: 27
+; CHECK: LV: Vector loop of width 4 costs: 15
; CHECK: LV: Selecting VF: 4.
define hidden void @two_bytes_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1484,8 +1484,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 23
-; CHECK: LV: Vector loop of width 4 costs: 13
+; CHECK: LV: Vector loop of width 2 costs: 27
+; CHECK: LV: Vector loop of width 4 costs: 15
; CHECK: LV: Selecting VF: 4.
define hidden void @two_bytes_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1526,9 +1526,9 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: Cost of 11 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 21
-; CHECK: LV: Vector loop of width 4 costs: 14.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 2 costs: 26
+; CHECK: LV: Vector loop of width 4 costs: 19.
+; CHECK: LV: Selecting VF: 1.
define hidden void @two_floats_two_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp22.not = icmp eq i32 %N, 0
@@ -1566,9 +1566,9 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 16
-; CHECK: LV: Vector loop of width 2 costs: 21
-; CHECK: LV: Vector loop of width 4 costs: 14.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 2 costs: 26
+; CHECK: LV: Vector loop of width 4 costs: 19.
+; CHECK: LV: Selecting VF: 1.
define hidden void @two_floats_two_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp21.not = icmp eq i32 %N, 0
@@ -1608,8 +1608,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 22
-; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Vector loop of width 2 costs: 24
+; CHECK: LV: Vector loop of width 4 costs: 12
; CHECK: LV: Selecting VF: 4.
define hidden void @two_shorts_two_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1652,8 +1652,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 18
-; CHECK: LV: Vector loop of width 2 costs: 22
-; CHECK: LV: Vector loop of width 4 costs: 11.
+; CHECK: LV: Vector loop of width 2 costs: 24
+; CHECK: LV: Vector loop of width 4 costs: 12
; CHECK: LV: Selecting VF: 4.
define hidden void @two_shorts_two_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1697,8 +1697,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 16
; CHECK: LV: Vector loop of width 2 costs: 20
-; CHECK: LV: Vector loop of width 4 costs: 13.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 4 costs: 18
+; CHECK: LV: Selecting VF: 1
define hidden void @two_floats_two_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp22.not = icmp eq i32 %N, 0
@@ -1739,8 +1739,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 2
; CHECK: LV: Scalar loop costs: 16
; CHECK: LV: Vector loop of width 2 costs: 20
-; CHECK: LV: Vector loop of width 4 costs: 13.
-; CHECK: LV: Selecting VF: 4.
+; CHECK: LV: Vector loop of width 4 costs: 18
+; CHECK: LV: Selecting VF: 1
define hidden void @two_floats_two_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp21.not = icmp eq i32 %N, 0
@@ -1883,8 +1883,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 43
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 51
+; CHECK: LV: Vector loop of width 4 costs: 27
; CHECK: LV: Selecting VF: 4
define hidden void @four_bytes_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -1943,8 +1943,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 43
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 51
+; CHECK: LV: Vector loop of width 4 costs: 27
; CHECK: LV: Selecting VF: 4
define hidden void @four_bytes_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2004,9 +2004,9 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 28
-; CHECK: LV: Vector loop of width 2 costs: 38
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 2 costs: 48
+; CHECK: LV: Vector loop of width 4 costs: 37
+; CHECK: LV: Selecting VF: 1
define hidden void @four_floats_four_bytes_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp48.not = icmp eq i32 %N, 0
@@ -2061,9 +2061,9 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 28
-; CHECK: LV: Vector loop of width 2 costs: 38
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 2 costs: 48
+; CHECK: LV: Vector loop of width 4 costs: 37
+; CHECK: LV: Selecting VF: 1
define hidden void @four_floats_four_bytes_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
@@ -2119,8 +2119,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 37
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 41
+; CHECK: LV: Vector loop of width 4 costs: 25
; CHECK: LV: Selecting VF: 4
define hidden void @four_shorts_four_floats_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2181,8 +2181,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: Cost of 36 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 32
-; CHECK: LV: Vector loop of width 2 costs: 37
-; CHECK: LV: Vector loop of width 4 costs: 23
+; CHECK: LV: Vector loop of width 2 costs: 41
+; CHECK: LV: Vector loop of width 4 costs: 25
; CHECK: LV: Selecting VF: 4
define hidden void @four_shorts_four_floats_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
@@ -2244,8 +2244,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 28
; CHECK: LV: Vector loop of width 2 costs: 35
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 4 costs: 37
+; CHECK: LV: Selecting VF: 1
define hidden void @four_floats_four_shorts_same_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp48.not = icmp eq i32 %N, 0
@@ -2302,8 +2302,8 @@ for.body: ; preds = %entry, %for.body
; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4
; CHECK: LV: Scalar loop costs: 28
; CHECK: LV: Vector loop of width 2 costs: 35
-; CHECK: LV: Vector loop of width 4 costs: 26
-; CHECK: LV: Selecting VF: 4
+; CHECK: LV: Vector loop of width 4 costs: 37
+; CHECK: LV: Selecting VF: 1
define hidden void @four_floats_four_shorts_vary_op(ptr noundef readonly captures(none) %a, ptr noundef readonly captures(none) %b, ptr noundef writeonly captures(none) %res, i32 noundef %N) {
entry:
%cmp45.not = icmp eq i32 %N, 0
More information about the llvm-commits
mailing list