[llvm] [CodeGen][TTI] Reduce funnel shift cost for constant shift amounts (PR #184942)
Jim Lin via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 9 18:30:58 PDT 2026
https://github.com/tclin914 updated https://github.com/llvm/llvm-project/pull/184942
>From 583c977032ae287e29c60c1448c9dc46ccbd6daa Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Mon, 9 Mar 2026 15:54:16 +0800
Subject: [PATCH 1/2] [RISCV] Pre-commit test
---
.../SLPVectorizer/RISCV/funnel-shift-cost.ll | 68 +++++++++++++++++++
1 file changed, 68 insertions(+)
create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll
new file mode 100644
index 0000000000000..47759b2a3c4aa
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+v -passes=slp-vectorizer -S < %s | FileCheck %s
+
+declare i16 @llvm.fshl.i16(i16, i16, i16)
+
+define void @foo(i16 %lx3, ptr %extra_bits, i16 %init_count) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: i16 [[LX3:%.*]], ptr [[EXTRA_BITS:%.*]], i16 [[INIT_COUNT:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
+; CHECK: [[WHILE_BODY]]:
+; CHECK-NEXT: [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS:.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[USE_RESULTS]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LX3]], i32 3
+; CHECK-NEXT: [[TMP3]] = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP2]], <4 x i16> splat (i16 1))
+; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[EXTRA_BITS]], align 2
+; CHECK-NEXT: br label %[[USE_RESULTS]]
+; CHECK: [[USE_RESULTS]]:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+; CHECK-NEXT: [[SUM01:%.*]] = add i16 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
+; CHECK-NEXT: [[SUM23:%.*]] = sub i16 [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[SUM:%.*]] = add i16 [[SUM01]], [[SUM23]]
+; CHECK-NEXT: store i16 [[SUM]], ptr [[EXTRA_BITS]], align 2
+; CHECK-NEXT: [[CTR_DEC]] = add i16 [[CTR]], -1
+; CHECK-NEXT: [[DONE:%.*]] = icmp sgt i16 [[CTR_DEC]], -1
+; CHECK-NEXT: br i1 [[DONE]], label %[[WHILE_BODY]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %eb1_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 2
+ %eb2_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 4
+ %eb3_ptr = getelementptr inbounds nuw i8, ptr %extra_bits, i64 6
+ br label %while.body
+
+while.body:
+ %eb0 = phi i16 [ 0, %entry ], [ %new_eb0, %use.results ]
+ %eb1 = phi i16 [ 0, %entry ], [ %new_eb1, %use.results ]
+ %eb2 = phi i16 [ 0, %entry ], [ %new_eb2, %use.results ]
+ %eb3 = phi i16 [ 0, %entry ], [ %new_eb3, %use.results ]
+ %ctr = phi i16 [ %init_count, %entry ], [ %ctr.dec, %use.results ]
+
+ %new_eb3 = tail call i16 @llvm.fshl.i16(i16 %eb3, i16 %lx3, i16 1)
+ store i16 %new_eb3, ptr %eb3_ptr, align 2
+ %new_eb2 = tail call i16 @llvm.fshl.i16(i16 %eb2, i16 %eb3, i16 1)
+ store i16 %new_eb2, ptr %eb2_ptr, align 2
+ %new_eb1 = tail call i16 @llvm.fshl.i16(i16 %eb1, i16 %eb2, i16 1)
+ store i16 %new_eb1, ptr %eb1_ptr, align 2
+ %new_eb0 = tail call i16 @llvm.fshl.i16(i16 %eb0, i16 %eb1, i16 1)
+ store i16 %new_eb0, ptr %extra_bits, align 2
+ br label %use.results
+
+use.results:
+ %sum01 = add i16 %new_eb0, %new_eb1
+ %sum23 = sub i16 %new_eb2, %new_eb3
+ %sum = add i16 %sum01, %sum23
+ store i16 %sum, ptr %extra_bits, align 2
+ %ctr.dec = add i16 %ctr, -1
+ %done = icmp sgt i16 %ctr.dec, -1
+ br i1 %done, label %while.body, label %exit
+
+exit:
+ ret void
+}
>From 45af52119996a422f776b720fd301db1135a1334 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Thu, 5 Mar 2026 16:24:15 +0800
Subject: [PATCH 2/2] [CodeGen][TTI] Reduce funnel shift cost for constant
shift amounts
The Sub instruction cost and the shift-by-zero handling costs (ICmp +
Select) are only needed when the shift amount is non-constant. Move them
inside the `!OpInfoZ.isConstant()` guard to avoid overestimating cost
for constant shift amounts.
The overestimated scalar cost caused the SLP vectorizer to incorrectly
prefer vectorizing funnel shifts with constant shift amounts, since SLP
compares the vector cost against the scalar cost, and a falsely high scalar
cost makes vectorization appear more profitable than it actually is.
Fixes #181308.
---
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 31 +-
llvm/test/Analysis/CostModel/AArch64/fshl.ll | 30 +-
llvm/test/Analysis/CostModel/AArch64/fshr.ll | 30 +-
.../Analysis/CostModel/RISCV/fshl_fshr.ll | 20 +-
llvm/test/Analysis/CostModel/X86/fshl.ll | 845 +++++++++---------
llvm/test/Analysis/CostModel/X86/fshr.ll | 845 +++++++++---------
.../SLPVectorizer/RISCV/funnel-shift-cost.ll | 47 +-
7 files changed, 903 insertions(+), 945 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index b0a7d11f08be7..6e0790f81c72e 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2061,32 +2061,33 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
InstructionCost Cost = 0;
Cost +=
thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
- Cost +=
- thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
Cost += thisT()->getArithmeticInstrCost(
BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
{OpInfoZ.Kind, TTI::OP_None});
Cost += thisT()->getArithmeticInstrCost(
BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
{OpInfoZ.Kind, TTI::OP_None});
- // Non-constant shift amounts requires a modulo. If the typesize is a
- // power-2 then this will be converted to an and, otherwise it will use a
- // urem.
- if (!OpInfoZ.isConstant())
+
+ if (!OpInfoZ.isConstant()) {
+ Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
+ CostKind);
+ // Non-constant shift amounts requires a modulo. If the typesize is a
+ // power-2 then this will be converted to an and, otherwise it will use
+ // a urem.
Cost += thisT()->getArithmeticInstrCost(
isPowerOf2_32(RetTy->getScalarSizeInBits()) ? BinaryOperator::And
: BinaryOperator::URem,
RetTy, CostKind, OpInfoZ,
{TTI::OK_UniformConstantValue, TTI::OP_None});
- // For non-rotates (X != Y) we must add shift-by-zero handling costs.
- if (X != Y) {
- Type *CondTy = RetTy->getWithNewBitWidth(1);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
- CmpInst::ICMP_EQ, CostKind);
- Cost +=
- thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
- CmpInst::ICMP_EQ, CostKind);
+ // For non-rotates (X != Y) we must add shift-by-zero handling costs.
+ if (X != Y) {
+ Type *CondTy = RetTy->getWithNewBitWidth(1);
+ Cost += thisT()->getCmpSelInstrCost(
+ BinaryOperator::ICmp, RetTy, CondTy, CmpInst::ICMP_EQ, CostKind);
+ Cost +=
+ thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+ CmpInst::ICMP_EQ, CostKind);
+ }
}
return Cost;
}
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
index 61296a8e3c5d3..4219a4de764f5 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -85,7 +85,7 @@ entry:
define i128 @fshl_i128_3rd_arg_const(i128 %a, i128 %b) {
; CHECK-LABEL: 'fshl_i128_3rd_arg_const'
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9)
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %b, i128 9)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r
;
entry:
@@ -136,7 +136,7 @@ entry:
define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: 'fshl_v16i8_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r
;
entry:
@@ -166,7 +166,7 @@ entry:
define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: 'fshl_v8i16_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r
;
entry:
@@ -196,7 +196,7 @@ entry:
define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: 'fshl_v4i32_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r
;
entry:
@@ -226,7 +226,7 @@ entry:
define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: 'fshl_v2i64_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r
;
entry:
@@ -256,7 +256,7 @@ entry:
define <2 x i66> @fshl_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
; CHECK-LABEL: 'fshl_v2i66_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i66> @llvm.fshl.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %r
;
entry:
@@ -266,7 +266,7 @@ entry:
define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a, <2 x i128> %b) {
; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1))
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1))
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
@@ -276,7 +276,7 @@ entry:
define <2 x i128> @fshl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) {
; CHECK-LABEL: 'fshl_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
@@ -379,7 +379,7 @@ entry:
define i128 @rotl_i128_3rd_arg_const(i128 %a) {
; CHECK-LABEL: 'rotl_i128_3rd_arg_const'
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 9)
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i128 @llvm.fshl.i128(i128 %a, i128 %a, i128 9)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r
;
entry:
@@ -409,7 +409,7 @@ entry:
define <16 x i8> @rotl_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a) {
; CHECK-LABEL: 'rotl_v16i8_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r
;
entry:
@@ -439,7 +439,7 @@ entry:
define <8 x i16> @rotl_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a) {
; CHECK-LABEL: 'rotl_v8i16_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r
;
entry:
@@ -469,7 +469,7 @@ entry:
define <4 x i32> @rotl_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a) {
; CHECK-LABEL: 'rotl_v4i32_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r
;
entry:
@@ -499,7 +499,7 @@ entry:
define <2 x i64> @rotl_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a) {
; CHECK-LABEL: 'rotl_v2i64_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 1, i64 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 1, i64 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r
;
entry:
@@ -519,7 +519,7 @@ entry:
define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a) {
; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1))
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1))
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
@@ -529,7 +529,7 @@ entry:
define <2 x i128> @rotl_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a) {
; CHECK-LABEL: 'rotl_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshl.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 1, i128 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
index 1aa6de967739b..e3e84becd3e00 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
@@ -85,7 +85,7 @@ entry:
define i128 @fshr_i128_3rd_arg_const(i128 %a, i128 %b) {
; CHECK-LABEL: 'fshr_i128_3rd_arg_const'
-; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:8 SizeLat:8 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9)
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %b, i128 9)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r
;
entry:
@@ -136,7 +136,7 @@ entry:
define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: 'fshr_v16i8_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r
;
entry:
@@ -166,7 +166,7 @@ entry:
define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: 'fshr_v8i16_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r
;
entry:
@@ -196,7 +196,7 @@ entry:
define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: 'fshr_v4i32_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r
;
entry:
@@ -226,7 +226,7 @@ entry:
define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LABEL: 'fshr_v2i64_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 6 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> <i64 1, i64 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r
;
entry:
@@ -256,7 +256,7 @@ entry:
define <2 x i66> @fshr_v2i66_3rd_arg_vec_const_lanes_different(<2 x i66> %a, <2 x i66> %b) {
; CHECK-LABEL: 'fshr_v2i66_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i66> @llvm.fshr.v2i66(<2 x i66> %a, <2 x i66> %b, <2 x i66> <i66 1, i66 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i66> %r
;
entry:
@@ -266,7 +266,7 @@ entry:
define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a, <2 x i128> %b) {
; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1))
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> splat (i128 1))
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
@@ -276,7 +276,7 @@ entry:
define <2 x i128> @fshr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a, <2 x i128> %b) {
; CHECK-LABEL: 'fshr_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:16 Lat:20 SizeLat:20 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 1, i128 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
@@ -379,7 +379,7 @@ entry:
define i128 @rotr_i128_3rd_arg_const(i128 %a) {
; CHECK-LABEL: 'rotr_i128_3rd_arg_const'
-; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 9)
+; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i128 @llvm.fshr.i128(i128 %a, i128 %a, i128 9)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i128 %r
;
entry:
@@ -409,7 +409,7 @@ entry:
define <16 x i8> @rotr_v16i8_3rd_arg_vec_const_lanes_different(<16 x i8> %a) {
; CHECK-LABEL: 'rotr_v16i8_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 9, i8 1, i8 13, i8 7, i8 31, i8 23, i8 43, i8 51, i8 3, i8 3, i8 17, i8 3, i8 11, i8 15, i8 3, i8 3>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %r
;
entry:
@@ -439,7 +439,7 @@ entry:
define <8 x i16> @rotr_v8i16_3rd_arg_vec_const_lanes_different(<8 x i16> %a) {
; CHECK-LABEL: 'rotr_v8i16_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %a, <8 x i16> <i16 3, i16 1, i16 13, i16 8, i16 7, i16 31, i16 43, i16 51>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %r
;
entry:
@@ -469,7 +469,7 @@ entry:
define <4 x i32> @rotr_v4i32_3rd_arg_vec_const_lanes_different(<4 x i32> %a) {
; CHECK-LABEL: 'rotr_v4i32_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 3, i32 11, i32 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x i32> %r
;
entry:
@@ -499,7 +499,7 @@ entry:
define <2 x i64> @rotr_v2i64_3rd_arg_vec_const_lanes_different(<2 x i64> %a) {
; CHECK-LABEL: 'rotr_v2i64_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 1, i64 2>)
+; CHECK-NEXT: Cost Model: Found costs of 3 for: %r = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> <i64 1, i64 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i64> %r
;
entry:
@@ -519,7 +519,7 @@ entry:
define <2 x i128> @rotr_v2i128_3rd_arg_vec_const_all_lanes_same(<2 x i128> %a) {
; CHECK-LABEL: 'rotr_v2i128_3rd_arg_vec_const_all_lanes_same'
-; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1))
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> splat (i128 1))
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
@@ -529,7 +529,7 @@ entry:
define <2 x i128> @rotr_v2i128_3rd_arg_vec_const_lanes_different(<2 x i128> %a) {
; CHECK-LABEL: 'rotr_v2i128_3rd_arg_vec_const_lanes_different'
-; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 1, i128 2>)
+; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call <2 x i128> @llvm.fshr.v2i128(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 1, i128 2>)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <2 x i128> %r
;
entry:
diff --git a/llvm/test/Analysis/CostModel/RISCV/fshl_fshr.ll b/llvm/test/Analysis/CostModel/RISCV/fshl_fshr.ll
index 04e6745d137e4..e6285f6cb95ac 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fshl_fshr.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fshl_fshr.ll
@@ -6,11 +6,11 @@
define i32 @rotl_i32_3rd_arg_const(i32 %a) {
; RV32-LABEL: 'rotl_i32_3rd_arg_const'
-; RV32-NEXT: Cost Model: Found costs of 4 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9)
+; RV32-NEXT: Cost Model: Found costs of 3 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9)
; RV32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
;
; RV64-LABEL: 'rotl_i32_3rd_arg_const'
-; RV64-NEXT: Cost Model: Found costs of 4 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9)
+; RV64-NEXT: Cost Model: Found costs of 3 for: %r = tail call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 9)
; RV64-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
;
; RV32ZBB-LABEL: 'rotl_i32_3rd_arg_const'
@@ -50,15 +50,15 @@ entry:
define i64 @rotl_i64_3rd_arg_const(i64 %a) {
; RV32-LABEL: 'rotl_i64_3rd_arg_const'
-; RV32-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9)
+; RV32-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9)
; RV32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
;
; RV64-LABEL: 'rotl_i64_3rd_arg_const'
-; RV64-NEXT: Cost Model: Found costs of 4 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9)
+; RV64-NEXT: Cost Model: Found costs of 3 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9)
; RV64-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
;
; RV32ZBB-LABEL: 'rotl_i64_3rd_arg_const'
-; RV32ZBB-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9)
+; RV32ZBB-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 9)
; RV32ZBB-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
;
; RV64ZBB-LABEL: 'rotl_i64_3rd_arg_const'
@@ -94,11 +94,11 @@ entry:
define i32 @rotr_i32_3rd_arg_const(i32 %a) {
; RV32-LABEL: 'rotr_i32_3rd_arg_const'
-; RV32-NEXT: Cost Model: Found costs of 4 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9)
+; RV32-NEXT: Cost Model: Found costs of 3 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9)
; RV32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
;
; RV64-LABEL: 'rotr_i32_3rd_arg_const'
-; RV64-NEXT: Cost Model: Found costs of 4 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9)
+; RV64-NEXT: Cost Model: Found costs of 3 for: %r = tail call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 9)
; RV64-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i32 %r
;
; RV32ZBB-LABEL: 'rotr_i32_3rd_arg_const'
@@ -138,15 +138,15 @@ entry:
define i64 @rotr_i64_3rd_arg_const(i64 %a) {
; RV32-LABEL: 'rotr_i64_3rd_arg_const'
-; RV32-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9)
+; RV32-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9)
; RV32-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
;
; RV64-LABEL: 'rotr_i64_3rd_arg_const'
-; RV64-NEXT: Cost Model: Found costs of 4 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9)
+; RV64-NEXT: Cost Model: Found costs of 3 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9)
; RV64-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
;
; RV32ZBB-LABEL: 'rotr_i64_3rd_arg_const'
-; RV32ZBB-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9)
+; RV32ZBB-NEXT: Cost Model: Found costs of RThru:6 CodeSize:3 Lat:3 SizeLat:3 for: %r = tail call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 9)
; RV32ZBB-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret i64 %r
;
; RV64ZBB-LABEL: 'rotr_i64_3rd_arg_const'
diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll
index c53a5072ff984..7b4ccde2bdaaf 100644
--- a/llvm/test/Analysis/CostModel/X86/fshl.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshl.ll
@@ -870,53 +870,46 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
-; SSSE3-LABEL: 'constant_funnel_i64'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:21 SizeLat:25 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:34 CodeSize:40 Lat:42 SizeLat:50 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:68 CodeSize:80 Lat:84 SizeLat:100 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'constant_funnel_i64'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:19 SizeLat:21 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:38 SizeLat:42 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:56 Lat:76 SizeLat:84 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'constant_funnel_i64'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SSE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SSE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i64'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:12 Lat:14 SizeLat:18 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:34 Lat:22 SizeLat:46 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:68 Lat:44 SizeLat:92 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:9 SizeLat:13 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:23 Lat:15 SizeLat:32 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:46 Lat:30 SizeLat:64 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i64'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:12 SizeLat:8 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:6 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:12 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i64'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i64'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i64'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i64'
@@ -928,23 +921,23 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; SLM-LABEL: 'constant_funnel_i64'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:19 CodeSize:14 Lat:24 SizeLat:22 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:28 Lat:48 SizeLat:44 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:56 Lat:96 SizeLat:88 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i64'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:19 SizeLat:21 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:38 SizeLat:42 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; GLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:56 Lat:76 SizeLat:84 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i64'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:22 Lat:22 SizeLat:28 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:44 Lat:44 SizeLat:56 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i64'
@@ -964,51 +957,51 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
; SSSE3-LABEL: 'constant_funnel_i32'
; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:23 CodeSize:28 Lat:25 SizeLat:32 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:46 CodeSize:55 Lat:49 SizeLat:63 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:92 CodeSize:109 Lat:97 SizeLat:125 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:21 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:42 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:76 CodeSize:92 Lat:84 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_funnel_i32'
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:20 Lat:33 SizeLat:25 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:46 CodeSize:39 Lat:65 SizeLat:49 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:92 CodeSize:77 Lat:129 SizeLat:97 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i32'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:24 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:31 CodeSize:43 Lat:30 SizeLat:60 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:62 CodeSize:86 Lat:60 SizeLat:120 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:14 Lat:13 SizeLat:20 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:32 Lat:23 SizeLat:46 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:64 Lat:46 SizeLat:92 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i32'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:11 SizeLat:11 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:6 Lat:13 SizeLat:14 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:12 Lat:26 SizeLat:28 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:8 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:16 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i32'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i32'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i32'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i32'
@@ -1020,23 +1013,23 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; SLM-LABEL: 'constant_funnel_i32'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:20 Lat:33 SizeLat:31 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:39 Lat:65 SizeLat:61 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:77 Lat:129 SizeLat:121 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:29 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:56 CodeSize:34 Lat:58 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:112 CodeSize:68 Lat:116 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i32'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:23 CodeSize:20 Lat:33 SizeLat:25 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:46 CodeSize:39 Lat:65 SizeLat:49 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; GLM-NEXT: Cost Model: Found costs of RThru:92 CodeSize:77 Lat:129 SizeLat:97 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i32'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:22 Lat:22 SizeLat:28 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:44 Lat:44 SizeLat:56 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i32'
@@ -1056,51 +1049,51 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
; SSSE3-LABEL: 'constant_funnel_i16'
; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:22 CodeSize:38 Lat:29 SizeLat:38 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:44 CodeSize:75 Lat:57 SizeLat:75 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:88 CodeSize:149 Lat:113 SizeLat:149 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:33 Lat:25 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:36 CodeSize:66 Lat:50 SizeLat:66 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:72 CodeSize:132 Lat:100 SizeLat:132 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_funnel_i16'
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:36 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:55 Lat:71 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:112 CodeSize:109 Lat:141 SizeLat:129 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i16'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:19 Lat:26 SizeLat:28 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:44 CodeSize:50 Lat:46 SizeLat:71 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:88 CodeSize:100 Lat:92 SizeLat:142 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:16 Lat:22 SizeLat:24 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:39 SizeLat:56 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:66 CodeSize:74 Lat:78 SizeLat:112 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i16'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:10 Lat:21 SizeLat:16 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:20 SizeLat:24 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:40 SizeLat:48 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:18 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:32 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:20 SizeLat:15 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:19 SizeLat:22 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:31 CodeSize:37 Lat:49 SizeLat:45 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i16'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:20 SizeLat:15 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:19 SizeLat:22 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:31 CodeSize:37 Lat:49 SizeLat:45 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i16'
@@ -1112,23 +1105,23 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
;
; SLM-LABEL: 'constant_funnel_i16'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:31 CodeSize:28 Lat:38 SizeLat:34 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:62 CodeSize:55 Lat:75 SizeLat:67 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:124 CodeSize:109 Lat:149 SizeLat:133 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:25 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:50 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:100 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i16'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:36 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:56 CodeSize:55 Lat:71 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:112 CodeSize:109 Lat:141 SizeLat:129 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i16'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:29 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:44 SizeLat:58 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i16'
@@ -1148,86 +1141,86 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
; SSSE3-LABEL: 'constant_funnel_i8'
; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:32 CodeSize:59 Lat:54 SizeLat:64 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:64 CodeSize:117 Lat:107 SizeLat:127 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:128 CodeSize:233 Lat:213 SizeLat:253 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:54 Lat:50 SizeLat:59 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:56 CodeSize:108 Lat:100 SizeLat:118 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:112 CodeSize:216 Lat:200 SizeLat:236 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_funnel_i8'
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:39 Lat:56 SizeLat:51 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:72 CodeSize:77 Lat:111 SizeLat:101 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:144 CodeSize:153 Lat:221 SizeLat:201 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i8'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:27 Lat:53 SizeLat:40 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:57 CodeSize:71 Lat:53 SizeLat:100 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:114 CodeSize:142 Lat:106 SizeLat:200 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:49 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:46 CodeSize:58 Lat:46 SizeLat:85 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:92 CodeSize:116 Lat:92 SizeLat:170 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i8'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:27 Lat:53 SizeLat:39 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:27 Lat:58 SizeLat:54 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:54 Lat:116 SizeLat:108 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:48 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:48 Lat:108 SizeLat:96 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:27 Lat:52 SizeLat:38 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:18 CodeSize:27 Lat:57 SizeLat:52 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:39 CodeSize:72 Lat:55 SizeLat:84 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i8'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:14 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:12 CodeSize:27 Lat:57 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:17 CodeSize:32 Lat:50 SizeLat:37 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:17 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:24 Lat:54 SizeLat:35 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:14 CodeSize:29 Lat:47 SizeLat:34 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:27 Lat:52 SizeLat:38 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:18 CodeSize:27 Lat:57 SizeLat:52 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:39 CodeSize:72 Lat:55 SizeLat:84 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i8'
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:14 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:27 Lat:57 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:32 Lat:50 SizeLat:37 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:17 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:24 Lat:54 SizeLat:35 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:29 Lat:47 SizeLat:34 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'constant_funnel_i8'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:39 Lat:58 SizeLat:52 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:77 Lat:115 SizeLat:103 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:152 CodeSize:153 Lat:229 SizeLat:205 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i8'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:39 Lat:56 SizeLat:51 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:72 CodeSize:77 Lat:111 SizeLat:101 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:144 CodeSize:153 Lat:221 SizeLat:201 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i8'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:29 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:44 SizeLat:58 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:14 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:12 CodeSize:27 Lat:57 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:17 CodeSize:32 Lat:50 SizeLat:37 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:17 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:24 Lat:54 SizeLat:35 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:14 CodeSize:29 Lat:47 SizeLat:34 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7)
@@ -1242,53 +1235,46 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i64'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:11 SizeLat:13 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:22 SizeLat:26 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:44 CodeSize:48 Lat:44 SizeLat:52 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i64'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:12 Lat:18 SizeLat:18 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:24 Lat:36 SizeLat:36 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i64'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i64'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:10 SizeLat:8 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i64'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:8 SizeLat:8 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:18 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i64'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i64'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i64'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i64'
@@ -1300,23 +1286,23 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
;
; SLM-LABEL: 'splatconstant_funnel_i64'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:13 CodeSize:6 Lat:14 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:12 Lat:28 SizeLat:20 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:24 Lat:56 SizeLat:40 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i64'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:12 Lat:18 SizeLat:18 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:24 Lat:36 SizeLat:36 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i64'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; XOP-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; XOP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
@@ -1334,53 +1320,46 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
}
define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i32'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:8 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:13 SizeLat:15 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:25 SizeLat:29 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i32'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i32'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i32'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i32'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:18 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i32'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i32'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i32'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i32'
@@ -1392,23 +1371,23 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
;
; SLM-LABEL: 'splatconstant_funnel_i32'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; SLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i32'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i32'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; XOP-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; XOP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
@@ -1426,53 +1405,46 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
}
define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i16'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:8 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:13 SizeLat:15 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:25 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i16'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i16'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i16'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:20 SizeLat:27 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:40 SizeLat:54 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i16'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:18 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:18 SizeLat:14 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:37 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i16'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:18 SizeLat:14 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:37 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i16'
@@ -1484,23 +1456,23 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
;
; SLM-LABEL: 'splatconstant_funnel_i16'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:8 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:11 Lat:17 SizeLat:15 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:21 Lat:33 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i16'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i16'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:20 SizeLat:27 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:40 SizeLat:54 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i16'
@@ -1518,88 +1490,81 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
}
define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i8'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:10 Lat:19 SizeLat:12 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:14 CodeSize:19 Lat:37 SizeLat:23 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:37 Lat:73 SizeLat:45 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i8'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:19 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:37 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:73 SizeLat:41 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i8'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
+; SSE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i8'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:19 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:28 Lat:22 SizeLat:33 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:56 Lat:44 SizeLat:66 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:15 Lat:15 SizeLat:18 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:30 Lat:30 SizeLat:36 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i8'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:21 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:21 SizeLat:16 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:16 Lat:42 SizeLat:32 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:34 SizeLat:20 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:41 SizeLat:27 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i8'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:18 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:41 SizeLat:27 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i8'
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:18 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatconstant_funnel_i8'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:21 SizeLat:12 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:15 Lat:41 SizeLat:23 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:29 Lat:81 SizeLat:45 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i8'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:19 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:37 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:73 SizeLat:41 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i8'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:29 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:44 SizeLat:58 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:13 SizeLat:5 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:13 SizeLat:5 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:13 SizeLat:5 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3)
@@ -2293,23 +2258,23 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
; SSE-LABEL: 'constant_rotate_i64'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; SSE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:15 SizeLat:17 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; SSE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:30 SizeLat:34 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SSE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:60 SizeLat:68 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SSE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:10 SizeLat:14 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:17 CodeSize:28 Lat:17 SizeLat:38 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:34 CodeSize:56 Lat:34 SizeLat:76 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:9 SizeLat:13 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:23 Lat:15 SizeLat:32 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:46 Lat:30 SizeLat:64 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:8 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:10 SizeLat:8 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:8 Lat:20 SizeLat:16 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'constant_rotate_i64'
@@ -2321,16 +2286,16 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; SLM-LABEL: 'constant_rotate_i64'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:13 CodeSize:12 Lat:15 SizeLat:17 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:24 Lat:30 SizeLat:34 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:48 Lat:60 SizeLat:68 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i64'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:15 SizeLat:17 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:30 SizeLat:34 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; GLM-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:60 SizeLat:68 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i64'
@@ -2350,30 +2315,30 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
; SSSE3-LABEL: 'constant_rotate_i32'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:28 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:40 CodeSize:47 Lat:43 SizeLat:55 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:80 CodeSize:93 Lat:85 SizeLat:109 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:21 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:42 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:76 CodeSize:92 Lat:84 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_rotate_i32'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:30 SizeLat:22 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:35 Lat:59 SizeLat:43 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:80 CodeSize:69 Lat:117 SizeLat:85 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:15 Lat:14 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:37 Lat:25 SizeLat:52 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:74 Lat:50 SizeLat:104 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:14 Lat:13 SizeLat:20 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:32 Lat:23 SizeLat:46 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:64 Lat:46 SizeLat:92 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:8 SizeLat:8 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:10 SizeLat:10 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:8 Lat:20 SizeLat:20 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:8 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:16 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'constant_rotate_i32'
@@ -2385,16 +2350,16 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; SLM-LABEL: 'constant_rotate_i32'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:29 CodeSize:18 Lat:30 SizeLat:28 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:58 CodeSize:35 Lat:59 SizeLat:55 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SLM-NEXT: Cost Model: Found costs of RThru:116 CodeSize:69 Lat:117 SizeLat:109 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:29 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:56 CodeSize:34 Lat:58 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:112 CodeSize:68 Lat:116 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i32'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:30 SizeLat:22 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:40 CodeSize:35 Lat:59 SizeLat:43 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; GLM-NEXT: Cost Model: Found costs of RThru:80 CodeSize:69 Lat:117 SizeLat:85 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i32'
@@ -2414,37 +2379,37 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
; SSSE3-LABEL: 'constant_rotate_i16'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:34 Lat:26 SizeLat:34 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:38 CodeSize:67 Lat:51 SizeLat:67 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:76 CodeSize:133 Lat:101 SizeLat:133 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:33 Lat:25 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:36 CodeSize:66 Lat:50 SizeLat:66 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:72 CodeSize:132 Lat:100 SizeLat:132 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_rotate_i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:26 Lat:33 SizeLat:30 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:50 CodeSize:51 Lat:65 SizeLat:59 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:100 CodeSize:101 Lat:129 SizeLat:117 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:17 Lat:23 SizeLat:25 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:37 CodeSize:42 Lat:41 SizeLat:62 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:74 CodeSize:84 Lat:82 SizeLat:124 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:16 Lat:22 SizeLat:24 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:39 SizeLat:56 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:66 CodeSize:74 Lat:78 SizeLat:112 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:13 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:17 SizeLat:20 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:24 CodeSize:26 Lat:34 SizeLat:40 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:18 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:32 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:13 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:17 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:40 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i16'
@@ -2456,9 +2421,9 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
;
; AVX512DQ-LABEL: 'constant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:13 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:17 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:40 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_rotate_i16'
@@ -2470,16 +2435,16 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
;
; SLM-LABEL: 'constant_rotate_i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:26 Lat:33 SizeLat:30 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:51 Lat:65 SizeLat:59 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:104 CodeSize:101 Lat:129 SizeLat:117 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:25 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:50 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:100 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i16'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:25 CodeSize:26 Lat:33 SizeLat:30 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:50 CodeSize:51 Lat:65 SizeLat:59 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:100 CodeSize:101 Lat:129 SizeLat:117 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i16'
@@ -2506,37 +2471,37 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
; SSSE3-LABEL: 'constant_rotate_i8'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:29 CodeSize:55 Lat:51 SizeLat:60 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:58 CodeSize:109 Lat:101 SizeLat:119 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:116 CodeSize:217 Lat:201 SizeLat:237 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:54 Lat:50 SizeLat:59 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:56 CodeSize:108 Lat:100 SizeLat:118 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:112 CodeSize:216 Lat:200 SizeLat:236 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_rotate_i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:53 SizeLat:48 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:66 CodeSize:73 Lat:105 SizeLat:95 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:132 CodeSize:145 Lat:209 SizeLat:189 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:23 CodeSize:25 Lat:50 SizeLat:37 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:50 CodeSize:63 Lat:48 SizeLat:91 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:100 CodeSize:126 Lat:96 SizeLat:182 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:49 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:46 CodeSize:58 Lat:46 SizeLat:85 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:92 CodeSize:116 Lat:92 SizeLat:170 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:25 Lat:50 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:25 Lat:55 SizeLat:50 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:50 Lat:110 SizeLat:100 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:48 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:48 Lat:108 SizeLat:96 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:25 Lat:50 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:25 Lat:55 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:34 CodeSize:63 Lat:46 SizeLat:75 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i8'
@@ -2548,9 +2513,9 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
; AVX512DQ-LABEL: 'constant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:25 Lat:50 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:25 Lat:55 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:34 CodeSize:63 Lat:46 SizeLat:75 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_rotate_i8'
@@ -2562,16 +2527,16 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
; SLM-LABEL: 'constant_rotate_i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:53 SizeLat:48 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:66 CodeSize:73 Lat:105 SizeLat:95 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:132 CodeSize:145 Lat:209 SizeLat:189 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i8'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:53 SizeLat:48 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:66 CodeSize:73 Lat:105 SizeLat:95 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:132 CodeSize:145 Lat:209 SizeLat:189 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i8'
@@ -2602,23 +2567,23 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i64'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:5 SizeLat:5 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:16 Lat:20 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:18 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:30 SizeLat:36 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:8 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:12 SizeLat:16 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'splatconstant_rotate_i64'
@@ -2630,16 +2595,16 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
;
; SLM-LABEL: 'splatconstant_rotate_i64'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:5 SizeLat:5 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:8 Lat:10 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:16 Lat:20 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i64'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:5 SizeLat:5 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:16 Lat:20 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i64'
@@ -2659,23 +2624,23 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i32'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
-; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:18 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:30 SizeLat:36 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:8 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:12 SizeLat:16 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'splatconstant_rotate_i32'
@@ -2687,16 +2652,16 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
;
; SLM-LABEL: 'splatconstant_rotate_i32'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
-; SLM-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i32'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5)
-; GLM-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i32'
@@ -2716,30 +2681,30 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i16'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:18 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:30 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:8 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:12 SizeLat:16 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:16 SizeLat:11 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:28 SizeLat:20 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i16'
@@ -2751,9 +2716,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
;
; AVX512DQ-LABEL: 'splatconstant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:16 SizeLat:11 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:28 SizeLat:20 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_rotate_i16'
@@ -2765,16 +2730,16 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
;
; SLM-LABEL: 'splatconstant_rotate_i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; SLM-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i16'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3)
-; GLM-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i16'
@@ -2801,30 +2766,30 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i8'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:11 Lat:31 SizeLat:15 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:21 Lat:61 SizeLat:29 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:20 Lat:17 SizeLat:24 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:40 Lat:34 SizeLat:48 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:15 Lat:15 SizeLat:18 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:30 Lat:30 SizeLat:36 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:18 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:12 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:36 SizeLat:24 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:34 SizeLat:20 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:18 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:11 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:32 SizeLat:18 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i8'
@@ -2836,9 +2801,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512DQ-LABEL: 'splatconstant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:18 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:11 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:32 SizeLat:18 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8'
@@ -2850,16 +2815,16 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; SLM-LABEL: 'splatconstant_rotate_i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:11 Lat:31 SizeLat:15 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:21 Lat:61 SizeLat:29 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i8'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3)
-; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:11 Lat:31 SizeLat:15 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:21 Lat:61 SizeLat:29 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i8'
diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll
index 1990605061716..e34a213a205b5 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr.ll
@@ -870,53 +870,46 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
-; SSSE3-LABEL: 'constant_funnel_i64'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:17 CodeSize:20 Lat:21 SizeLat:25 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:34 CodeSize:40 Lat:42 SizeLat:50 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:68 CodeSize:80 Lat:84 SizeLat:100 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'constant_funnel_i64'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:19 SizeLat:21 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:38 SizeLat:42 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:52 CodeSize:56 Lat:76 SizeLat:84 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'constant_funnel_i64'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SSE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SSE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i64'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:12 Lat:14 SizeLat:18 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:34 Lat:22 SizeLat:46 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:68 Lat:44 SizeLat:92 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:9 SizeLat:13 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:23 Lat:15 SizeLat:32 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:46 Lat:30 SizeLat:64 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i64'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:12 SizeLat:8 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:6 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:12 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i64'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i64'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i64'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i64'
@@ -928,23 +921,23 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; SLM-LABEL: 'constant_funnel_i64'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:19 CodeSize:14 Lat:24 SizeLat:22 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:28 Lat:48 SizeLat:44 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:56 Lat:96 SizeLat:88 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i64'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:13 CodeSize:14 Lat:19 SizeLat:21 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:38 SizeLat:42 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; GLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:56 Lat:76 SizeLat:84 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i64'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:22 Lat:22 SizeLat:28 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:44 Lat:44 SizeLat:56 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i64'
@@ -964,51 +957,51 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
; SSSE3-LABEL: 'constant_funnel_i32'
; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:23 CodeSize:28 Lat:25 SizeLat:32 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:46 CodeSize:55 Lat:49 SizeLat:63 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:92 CodeSize:109 Lat:97 SizeLat:125 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:21 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:42 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:76 CodeSize:92 Lat:84 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_funnel_i32'
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:23 CodeSize:20 Lat:33 SizeLat:25 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:46 CodeSize:39 Lat:65 SizeLat:49 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:92 CodeSize:77 Lat:129 SizeLat:97 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i32'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:17 Lat:17 SizeLat:24 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:31 CodeSize:43 Lat:30 SizeLat:60 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:62 CodeSize:86 Lat:60 SizeLat:120 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:14 Lat:13 SizeLat:20 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:32 Lat:23 SizeLat:46 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:64 Lat:46 SizeLat:92 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i32'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:11 SizeLat:11 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:6 Lat:13 SizeLat:14 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:26 CodeSize:12 Lat:26 SizeLat:28 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:8 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:16 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i32'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i32'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i32'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i32'
@@ -1020,23 +1013,23 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; SLM-LABEL: 'constant_funnel_i32'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:20 Lat:33 SizeLat:31 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:39 Lat:65 SizeLat:61 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:77 Lat:129 SizeLat:121 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:29 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:56 CodeSize:34 Lat:58 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:112 CodeSize:68 Lat:116 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i32'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:23 CodeSize:20 Lat:33 SizeLat:25 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:46 CodeSize:39 Lat:65 SizeLat:49 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; GLM-NEXT: Cost Model: Found costs of RThru:92 CodeSize:77 Lat:129 SizeLat:97 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i32'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:22 Lat:22 SizeLat:28 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:44 Lat:44 SizeLat:56 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i32'
@@ -1056,51 +1049,51 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
; SSSE3-LABEL: 'constant_funnel_i16'
; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:22 CodeSize:38 Lat:29 SizeLat:38 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:44 CodeSize:75 Lat:57 SizeLat:75 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:88 CodeSize:149 Lat:113 SizeLat:149 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:33 Lat:25 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:36 CodeSize:66 Lat:50 SizeLat:66 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:72 CodeSize:132 Lat:100 SizeLat:132 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_funnel_i16'
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:36 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:56 CodeSize:55 Lat:71 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:112 CodeSize:109 Lat:141 SizeLat:129 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i16'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:19 CodeSize:19 Lat:26 SizeLat:28 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:44 CodeSize:50 Lat:46 SizeLat:71 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:88 CodeSize:100 Lat:92 SizeLat:142 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:16 Lat:22 SizeLat:24 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:39 SizeLat:56 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:66 CodeSize:74 Lat:78 SizeLat:112 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i16'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:10 Lat:21 SizeLat:16 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:15 Lat:20 SizeLat:24 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:30 Lat:40 SizeLat:48 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:18 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:32 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:20 SizeLat:15 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:19 SizeLat:22 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:31 CodeSize:37 Lat:49 SizeLat:45 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i16'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:20 SizeLat:15 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:19 SizeLat:22 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:31 CodeSize:37 Lat:49 SizeLat:45 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i16'
@@ -1112,23 +1105,23 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
;
; SLM-LABEL: 'constant_funnel_i16'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:31 CodeSize:28 Lat:38 SizeLat:34 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:62 CodeSize:55 Lat:75 SizeLat:67 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:124 CodeSize:109 Lat:149 SizeLat:133 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:25 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:50 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:100 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i16'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:28 Lat:36 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:56 CodeSize:55 Lat:71 SizeLat:65 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:112 CodeSize:109 Lat:141 SizeLat:129 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i16'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:29 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:44 SizeLat:58 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i16'
@@ -1148,86 +1141,86 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
; SSSE3-LABEL: 'constant_funnel_i8'
; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:32 CodeSize:59 Lat:54 SizeLat:64 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:64 CodeSize:117 Lat:107 SizeLat:127 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:128 CodeSize:233 Lat:213 SizeLat:253 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:54 Lat:50 SizeLat:59 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:56 CodeSize:108 Lat:100 SizeLat:118 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:112 CodeSize:216 Lat:200 SizeLat:236 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_funnel_i8'
; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:36 CodeSize:39 Lat:56 SizeLat:51 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:72 CodeSize:77 Lat:111 SizeLat:101 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:144 CodeSize:153 Lat:221 SizeLat:201 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_funnel_i8'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:27 Lat:53 SizeLat:40 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:57 CodeSize:71 Lat:53 SizeLat:100 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:114 CodeSize:142 Lat:106 SizeLat:200 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:49 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:46 CodeSize:58 Lat:46 SizeLat:85 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:92 CodeSize:116 Lat:92 SizeLat:170 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_funnel_i8'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:27 Lat:53 SizeLat:39 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:19 CodeSize:27 Lat:58 SizeLat:54 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:38 CodeSize:54 Lat:116 SizeLat:108 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:48 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:48 Lat:108 SizeLat:96 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:27 Lat:52 SizeLat:38 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:18 CodeSize:27 Lat:57 SizeLat:52 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:39 CodeSize:72 Lat:55 SizeLat:84 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_funnel_i8'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:14 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:12 CodeSize:27 Lat:57 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:17 CodeSize:32 Lat:50 SizeLat:37 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:17 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:9 CodeSize:24 Lat:54 SizeLat:35 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:14 CodeSize:29 Lat:47 SizeLat:34 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'constant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:27 Lat:52 SizeLat:38 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:18 CodeSize:27 Lat:57 SizeLat:52 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:39 CodeSize:72 Lat:55 SizeLat:84 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_funnel_i8'
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:14 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:27 Lat:57 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:17 CodeSize:32 Lat:50 SizeLat:37 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:17 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:24 Lat:54 SizeLat:35 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:29 Lat:47 SizeLat:34 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'constant_funnel_i8'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:39 Lat:58 SizeLat:52 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:77 Lat:115 SizeLat:103 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:152 CodeSize:153 Lat:229 SizeLat:205 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_funnel_i8'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:39 Lat:56 SizeLat:51 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:72 CodeSize:77 Lat:111 SizeLat:101 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:144 CodeSize:153 Lat:221 SizeLat:201 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_funnel_i8'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:29 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:44 SizeLat:58 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'constant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:20 SizeLat:14 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:12 CodeSize:27 Lat:57 SizeLat:38 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:17 CodeSize:32 Lat:50 SizeLat:37 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:9 Lat:17 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:9 CodeSize:24 Lat:54 SizeLat:35 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:14 CodeSize:29 Lat:47 SizeLat:34 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
@@ -1242,53 +1235,46 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512, i64 %b64, <2 x i64> %b128, <4 x i64> %b256, <8 x i64> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i64'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:11 SizeLat:13 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:22 SizeLat:26 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:44 CodeSize:48 Lat:44 SizeLat:52 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i64'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:12 Lat:18 SizeLat:18 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:24 Lat:36 SizeLat:36 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i64'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i64'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:10 SizeLat:8 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i64'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:8 SizeLat:8 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:18 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i64'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i64'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i64'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i64'
@@ -1300,23 +1286,23 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
;
; SLM-LABEL: 'splatconstant_funnel_i64'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:13 CodeSize:6 Lat:14 SizeLat:10 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:12 Lat:28 SizeLat:20 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:24 Lat:56 SizeLat:40 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i64'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:9 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:12 Lat:18 SizeLat:18 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:24 Lat:36 SizeLat:36 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i64'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
-; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
-; XOP-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> splat (i64 7))
+; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> splat (i64 7))
+; XOP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> splat (i64 7))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
@@ -1334,53 +1320,46 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
}
define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512, i32 %b32, <4 x i32> %b128, <8 x i32> %b256, <16 x i32> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i32'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:8 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:13 SizeLat:15 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:25 SizeLat:29 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i32'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i32'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i32'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i32'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:18 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i32'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i32'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i32'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:6 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i32'
@@ -1392,23 +1371,23 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
;
; SLM-LABEL: 'splatconstant_funnel_i32'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; SLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i32'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i32'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:4 SizeLat:4 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
-; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:20 Lat:20 SizeLat:26 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
-; XOP-NEXT: Cost Model: Found costs of RThru:36 CodeSize:40 Lat:40 SizeLat:52 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> splat (i32 5))
+; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> splat (i32 5))
+; XOP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> splat (i32 5))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
@@ -1426,53 +1405,46 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
}
define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512, i16 %b16, <8 x i16> %b128, <16 x i16> %b256, <32 x i16> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i16'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:7 SizeLat:8 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:13 SizeLat:15 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:25 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i16'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i16'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i16'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:20 SizeLat:27 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:40 SizeLat:54 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i16'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:12 Lat:18 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i16'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; AVX512F-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:18 SizeLat:14 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:37 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i16'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX512BW-NEXT: Cost Model: Found costs of 6 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX512BW-NEXT: Cost Model: Found costs of 3 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of 6 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:12 Lat:18 SizeLat:14 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:19 CodeSize:25 Lat:37 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i16'
@@ -1484,23 +1456,23 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
;
; SLM-LABEL: 'splatconstant_funnel_i16'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:6 Lat:9 SizeLat:8 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:11 Lat:17 SizeLat:15 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:21 Lat:33 SizeLat:29 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i16'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:7 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:11 Lat:13 SizeLat:13 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:21 Lat:25 SizeLat:25 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i16'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:9 SizeLat:7 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:20 SizeLat:27 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:40 SizeLat:54 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> splat (i16 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> splat (i16 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> splat (i16 3))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i16'
@@ -1518,88 +1490,81 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
}
define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512, i8 %b8, <16 x i8> %b128, <32 x i8> %b256, <64 x i8> %b512) {
-; SSSE3-LABEL: 'splatconstant_funnel_i8'
-; SSSE3-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:7 CodeSize:10 Lat:19 SizeLat:12 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:14 CodeSize:19 Lat:37 SizeLat:23 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:37 Lat:73 SizeLat:45 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
-; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
-;
-; SSE42-LABEL: 'splatconstant_funnel_i8'
-; SSE42-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; SSE42-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:19 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:37 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:73 SizeLat:41 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
-; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
+; SSE-LABEL: 'splatconstant_funnel_i8'
+; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; SSE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_funnel_i8'
; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:19 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:28 Lat:22 SizeLat:33 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:56 Lat:44 SizeLat:66 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:15 Lat:15 SizeLat:18 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:30 Lat:30 SizeLat:36 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_funnel_i8'
; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:21 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:21 SizeLat:16 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:16 Lat:42 SizeLat:32 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:34 SizeLat:20 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_funnel_i8'
; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:41 SizeLat:27 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_funnel_i8'
; AVX512BW-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:18 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512BW-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512BW-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512BW-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512DQ-LABEL: 'splatconstant_funnel_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:20 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:25 Lat:41 SizeLat:27 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_funnel_i8'
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:18 SizeLat:10 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:8 Lat:20 SizeLat:10 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512VBMI2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SLM-LABEL: 'splatconstant_funnel_i8'
; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:8 Lat:21 SizeLat:12 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:15 Lat:41 SizeLat:23 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:29 Lat:81 SizeLat:45 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_funnel_i8'
; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; GLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:8 Lat:19 SizeLat:11 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:15 Lat:37 SizeLat:21 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:29 Lat:73 SizeLat:41 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_funnel_i8'
; XOP-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; XOP-NEXT: Cost Model: Found costs of RThru:7 CodeSize:6 Lat:11 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:29 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; XOP-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:44 SizeLat:58 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:7 SizeLat:3 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:15 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; XOP-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:30 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; XOP-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:5 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
-; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:13 SizeLat:5 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> splat (i8 3))
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:13 SizeLat:5 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> splat (i8 3))
+; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:13 SizeLat:5 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> splat (i8 3))
; AVX512GFNI-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
@@ -2293,23 +2258,23 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
; SSE-LABEL: 'constant_rotate_i64'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; SSE-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:15 SizeLat:17 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; SSE-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:30 SizeLat:34 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SSE-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:60 SizeLat:68 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SSE-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SSE-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:10 SizeLat:14 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:17 CodeSize:28 Lat:17 SizeLat:38 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:34 CodeSize:56 Lat:34 SizeLat:76 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:9 SizeLat:13 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:23 Lat:15 SizeLat:32 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:46 Lat:30 SizeLat:64 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:8 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:10 SizeLat:8 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:8 Lat:20 SizeLat:16 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'constant_rotate_i64'
@@ -2321,16 +2286,16 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
;
; SLM-LABEL: 'constant_rotate_i64'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:13 CodeSize:12 Lat:15 SizeLat:17 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:24 Lat:30 SizeLat:34 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:48 Lat:60 SizeLat:68 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i64'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:10 CodeSize:12 Lat:15 SizeLat:17 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:30 SizeLat:34 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; GLM-NEXT: Cost Model: Found costs of RThru:40 CodeSize:48 Lat:60 SizeLat:68 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:9 CodeSize:11 Lat:13 SizeLat:15 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 1, i64 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:18 CodeSize:22 Lat:26 SizeLat:30 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; GLM-NEXT: Cost Model: Found costs of RThru:36 CodeSize:44 Lat:52 SizeLat:60 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i64'
@@ -2350,30 +2315,30 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
; SSSE3-LABEL: 'constant_rotate_i32'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:20 CodeSize:24 Lat:22 SizeLat:28 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:40 CodeSize:47 Lat:43 SizeLat:55 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:80 CodeSize:93 Lat:85 SizeLat:109 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:23 Lat:21 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:38 CodeSize:46 Lat:42 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:76 CodeSize:92 Lat:84 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_rotate_i32'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:30 SizeLat:22 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:40 CodeSize:35 Lat:59 SizeLat:43 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:80 CodeSize:69 Lat:117 SizeLat:85 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:10 CodeSize:15 Lat:14 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:24 CodeSize:37 Lat:25 SizeLat:52 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:48 CodeSize:74 Lat:50 SizeLat:104 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:14 Lat:13 SizeLat:20 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:20 CodeSize:32 Lat:23 SizeLat:46 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:40 CodeSize:64 Lat:46 SizeLat:92 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:8 SizeLat:8 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:10 SizeLat:10 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:20 CodeSize:8 Lat:20 SizeLat:20 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:7 SizeLat:7 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:9 CodeSize:3 Lat:9 SizeLat:8 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:18 CodeSize:6 Lat:18 SizeLat:16 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'constant_rotate_i32'
@@ -2385,16 +2350,16 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
;
; SLM-LABEL: 'constant_rotate_i32'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:29 CodeSize:18 Lat:30 SizeLat:28 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:58 CodeSize:35 Lat:59 SizeLat:55 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; SLM-NEXT: Cost Model: Found costs of RThru:116 CodeSize:69 Lat:117 SizeLat:109 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:17 Lat:29 SizeLat:27 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:56 CodeSize:34 Lat:58 SizeLat:54 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; SLM-NEXT: Cost Model: Found costs of RThru:112 CodeSize:68 Lat:116 SizeLat:108 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i32'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:20 CodeSize:18 Lat:30 SizeLat:22 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:40 CodeSize:35 Lat:59 SizeLat:43 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
-; GLM-NEXT: Cost Model: Found costs of RThru:80 CodeSize:69 Lat:117 SizeLat:85 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:19 CodeSize:17 Lat:29 SizeLat:21 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:38 CodeSize:34 Lat:58 SizeLat:42 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
+; GLM-NEXT: Cost Model: Found costs of RThru:76 CodeSize:68 Lat:116 SizeLat:84 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i32'
@@ -2414,37 +2379,37 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
; SSSE3-LABEL: 'constant_rotate_i16'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:19 CodeSize:34 Lat:26 SizeLat:34 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:38 CodeSize:67 Lat:51 SizeLat:67 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:76 CodeSize:133 Lat:101 SizeLat:133 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:18 CodeSize:33 Lat:25 SizeLat:33 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:36 CodeSize:66 Lat:50 SizeLat:66 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:72 CodeSize:132 Lat:100 SizeLat:132 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_rotate_i16'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:25 CodeSize:26 Lat:33 SizeLat:30 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:50 CodeSize:51 Lat:65 SizeLat:59 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:100 CodeSize:101 Lat:129 SizeLat:117 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:16 CodeSize:17 Lat:23 SizeLat:25 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:37 CodeSize:42 Lat:41 SizeLat:62 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:74 CodeSize:84 Lat:82 SizeLat:124 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:15 CodeSize:16 Lat:22 SizeLat:24 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:39 SizeLat:56 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:66 CodeSize:74 Lat:78 SizeLat:112 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:13 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:17 SizeLat:20 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:24 CodeSize:26 Lat:34 SizeLat:40 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:18 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:32 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:13 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:17 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:40 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i16'
@@ -2456,9 +2421,9 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
;
; AVX512DQ-LABEL: 'constant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:18 SizeLat:13 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:12 CodeSize:13 Lat:17 SizeLat:19 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:26 CodeSize:28 Lat:40 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:7 CodeSize:7 Lat:17 SizeLat:12 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:12 Lat:16 SizeLat:17 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:23 CodeSize:23 Lat:33 SizeLat:31 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_rotate_i16'
@@ -2470,16 +2435,16 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
;
; SLM-LABEL: 'constant_rotate_i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:26 CodeSize:26 Lat:33 SizeLat:30 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:52 CodeSize:51 Lat:65 SizeLat:59 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; SLM-NEXT: Cost Model: Found costs of RThru:104 CodeSize:101 Lat:129 SizeLat:117 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:25 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:50 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; SLM-NEXT: Cost Model: Found costs of RThru:100 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i16'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:25 CodeSize:26 Lat:33 SizeLat:30 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:50 CodeSize:51 Lat:65 SizeLat:59 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; GLM-NEXT: Cost Model: Found costs of RThru:100 CodeSize:101 Lat:129 SizeLat:117 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:24 CodeSize:25 Lat:32 SizeLat:29 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:48 CodeSize:50 Lat:64 SizeLat:58 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; GLM-NEXT: Cost Model: Found costs of RThru:96 CodeSize:100 Lat:128 SizeLat:116 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i16'
@@ -2506,37 +2471,37 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
; SSSE3-LABEL: 'constant_rotate_i8'
; SSSE3-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:29 CodeSize:55 Lat:51 SizeLat:60 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:58 CodeSize:109 Lat:101 SizeLat:119 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSSE3-NEXT: Cost Model: Found costs of RThru:116 CodeSize:217 Lat:201 SizeLat:237 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:28 CodeSize:54 Lat:50 SizeLat:59 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:56 CodeSize:108 Lat:100 SizeLat:118 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSSE3-NEXT: Cost Model: Found costs of RThru:112 CodeSize:216 Lat:200 SizeLat:236 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSSE3-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; SSE42-LABEL: 'constant_rotate_i8'
; SSE42-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; SSE42-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:53 SizeLat:48 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:66 CodeSize:73 Lat:105 SizeLat:95 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SSE42-NEXT: Cost Model: Found costs of RThru:132 CodeSize:145 Lat:209 SizeLat:189 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SSE42-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SSE42-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'constant_rotate_i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:23 CodeSize:25 Lat:50 SizeLat:37 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:50 CodeSize:63 Lat:48 SizeLat:91 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX1-NEXT: Cost Model: Found costs of RThru:100 CodeSize:126 Lat:96 SizeLat:182 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:24 Lat:49 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:46 CodeSize:58 Lat:46 SizeLat:85 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX1-NEXT: Cost Model: Found costs of RThru:92 CodeSize:116 Lat:92 SizeLat:170 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'constant_rotate_i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; AVX2-NEXT: Cost Model: Found costs of RThru:14 CodeSize:25 Lat:50 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:16 CodeSize:25 Lat:55 SizeLat:50 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX2-NEXT: Cost Model: Found costs of RThru:32 CodeSize:50 Lat:110 SizeLat:100 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:48 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX2-NEXT: Cost Model: Found costs of RThru:30 CodeSize:48 Lat:108 SizeLat:96 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'constant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:25 Lat:50 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:16 CodeSize:25 Lat:55 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:34 CodeSize:63 Lat:46 SizeLat:75 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512F-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'constant_rotate_i8'
@@ -2548,9 +2513,9 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
; AVX512DQ-LABEL: 'constant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:25 Lat:50 SizeLat:36 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:16 CodeSize:25 Lat:55 SizeLat:49 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:34 CodeSize:63 Lat:46 SizeLat:75 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:13 CodeSize:24 Lat:49 SizeLat:35 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:15 CodeSize:24 Lat:54 SizeLat:47 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:31 CodeSize:58 Lat:39 SizeLat:70 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'constant_rotate_i8'
@@ -2562,16 +2527,16 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
;
; SLM-LABEL: 'constant_rotate_i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:53 SizeLat:48 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:66 CodeSize:73 Lat:105 SizeLat:95 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; SLM-NEXT: Cost Model: Found costs of RThru:132 CodeSize:145 Lat:209 SizeLat:189 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; SLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'constant_rotate_i8'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:33 CodeSize:37 Lat:53 SizeLat:48 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:66 CodeSize:73 Lat:105 SizeLat:95 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; GLM-NEXT: Cost Model: Found costs of RThru:132 CodeSize:145 Lat:209 SizeLat:189 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:32 CodeSize:36 Lat:52 SizeLat:47 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:64 CodeSize:72 Lat:104 SizeLat:94 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; GLM-NEXT: Cost Model: Found costs of RThru:128 CodeSize:144 Lat:208 SizeLat:188 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'constant_rotate_i8'
@@ -2602,23 +2567,23 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i64'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:5 SizeLat:5 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:16 Lat:20 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i64'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:18 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:30 SizeLat:36 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i64'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:8 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:12 SizeLat:16 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'splatconstant_rotate_i64'
@@ -2630,16 +2595,16 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
;
; SLM-LABEL: 'splatconstant_rotate_i64'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; SLM-NEXT: Cost Model: Found costs of RThru:7 CodeSize:4 Lat:5 SizeLat:5 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:14 CodeSize:8 Lat:10 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; SLM-NEXT: Cost Model: Found costs of RThru:28 CodeSize:16 Lat:20 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i64'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:5 SizeLat:5 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:8 Lat:10 SizeLat:10 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:16 Lat:20 SizeLat:20 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> splat (i64 7))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> splat (i64 7))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i64'
@@ -2659,23 +2624,23 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i32'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i32'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
-; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:18 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:30 SizeLat:36 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i32'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:8 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:12 SizeLat:16 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512-LABEL: 'splatconstant_rotate_i32'
@@ -2687,16 +2652,16 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
;
; SLM-LABEL: 'splatconstant_rotate_i32'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
-; SLM-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; SLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i32'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
-; GLM-NEXT: Cost Model: Found costs of 4 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> splat (i32 5))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> splat (i32 5))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i32'
@@ -2716,30 +2681,30 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i16'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; SSE-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; SSE-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i16'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:4 CodeSize:4 Lat:6 SizeLat:4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:11 CodeSize:14 Lat:15 SizeLat:18 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:22 CodeSize:28 Lat:30 SizeLat:36 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:3 CodeSize:3 Lat:5 SizeLat:3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:7 CodeSize:9 Lat:13 SizeLat:12 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:14 CodeSize:18 Lat:26 SizeLat:24 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i16'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; AVX2-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:6 SizeLat:8 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:8 Lat:12 SizeLat:16 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:3 Lat:5 SizeLat:6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:6 Lat:10 SizeLat:12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_rotate_i16'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; AVX512F-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:16 SizeLat:11 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:28 SizeLat:20 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i16'
@@ -2751,9 +2716,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
;
; AVX512DQ-LABEL: 'splatconstant_rotate_i16'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:16 SizeLat:11 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:14 CodeSize:16 Lat:28 SizeLat:20 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:9 Lat:15 SizeLat:9 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:11 CodeSize:11 Lat:21 SizeLat:15 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_rotate_i16'
@@ -2765,16 +2730,16 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
;
; SLM-LABEL: 'splatconstant_rotate_i16'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; SLM-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; SLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i16'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; GLM-NEXT: Cost Model: Found costs of 4 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:7 Lat:7 SizeLat:7 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:13 Lat:13 SizeLat:13 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 3 for: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 6 for: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> splat (i16 3))
+; GLM-NEXT: Cost Model: Found costs of 12 for: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> splat (i16 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i16'
@@ -2801,30 +2766,30 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
; SSE-LABEL: 'splatconstant_rotate_i8'
; SSE-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; SSE-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:8 CodeSize:11 Lat:31 SizeLat:15 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; SSE-NEXT: Cost Model: Found costs of RThru:16 CodeSize:21 Lat:61 SizeLat:29 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; SSE-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; SSE-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX1-LABEL: 'splatconstant_rotate_i8'
; AVX1-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX1-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:13 CodeSize:20 Lat:17 SizeLat:24 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX1-NEXT: Cost Model: Found costs of RThru:26 CodeSize:40 Lat:34 SizeLat:48 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:9 CodeSize:15 Lat:15 SizeLat:18 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX1-NEXT: Cost Model: Found costs of RThru:18 CodeSize:30 Lat:30 SizeLat:36 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX1-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX2-LABEL: 'splatconstant_rotate_i8'
; AVX2-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX2-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:18 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:12 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX2-NEXT: Cost Model: Found costs of RThru:12 CodeSize:12 Lat:36 SizeLat:24 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:10 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX2-NEXT: Cost Model: Found costs of RThru:10 CodeSize:10 Lat:34 SizeLat:20 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX2-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512F-LABEL: 'splatconstant_rotate_i8'
; AVX512F-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512F-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:18 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:11 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX512F-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:32 SizeLat:18 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX512F-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX512F-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512BW-LABEL: 'splatconstant_rotate_i8'
@@ -2836,9 +2801,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; AVX512DQ-LABEL: 'splatconstant_rotate_i8'
; AVX512DQ-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:18 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:6 CodeSize:6 Lat:18 SizeLat:11 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; AVX512DQ-NEXT: Cost Model: Found costs of RThru:8 CodeSize:16 Lat:32 SizeLat:18 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:17 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:5 Lat:17 SizeLat:9 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; AVX512DQ-NEXT: Cost Model: Found costs of RThru:5 CodeSize:11 Lat:25 SizeLat:13 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; AVX512DQ-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8'
@@ -2850,16 +2815,16 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; SLM-LABEL: 'splatconstant_rotate_i8'
; SLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; SLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:11 Lat:31 SizeLat:15 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; SLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:21 Lat:61 SizeLat:29 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; SLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; SLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; GLM-LABEL: 'splatconstant_rotate_i8'
; GLM-NEXT: Cost Model: Found costs of 1 for: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; GLM-NEXT: Cost Model: Found costs of RThru:4 CodeSize:6 Lat:16 SizeLat:8 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:8 CodeSize:11 Lat:31 SizeLat:15 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
-; GLM-NEXT: Cost Model: Found costs of RThru:16 CodeSize:21 Lat:61 SizeLat:29 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:3 CodeSize:5 Lat:15 SizeLat:7 for: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:6 CodeSize:10 Lat:30 SizeLat:14 for: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> splat (i8 3))
+; GLM-NEXT: Cost Model: Found costs of RThru:12 CodeSize:20 Lat:60 SizeLat:28 for: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> splat (i8 3))
; GLM-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; XOP-LABEL: 'splatconstant_rotate_i8'
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll
index 47759b2a3c4aa..b214b9422b79e 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/funnel-shift-cost.ll
@@ -1,27 +1,54 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=riscv64 -mattr=+v -passes=slp-vectorizer -S < %s | FileCheck %s
+; The four fshl.i16 calls use a constant shift amount (1), so the scalar cost
+; of each is 3 (Or + Shl + LShr only; Sub, modulo, ICmp and Select are not
+; needed for constant shift amounts). Total scalar fshl cost = 4 x 3 = 12.
+;
+; SLP considers vectorizing the fshl+store bundle to <4 x i16>:
+; - fshl bundle: VectorCost=7 ScalarCost=12 net=-5
+; - store bundle: VectorCost=1 ScalarCost=4 net=-3
+; - right-input gather (non-contiguous phi values): +3
+; Tree total cost = (-5) + (-3) + 3 = -5
+;
+; However, the four fshl results are also consumed by scalar add/sub in
+; use.results, requiring element extractions from the vector:
+; ExtractElement cost = 1+2+2+2 = 7
+;
+; Total cost = -5 + 7 = 2 > 0, so SLP correctly decides not to vectorize.
+;
+; Before the fix, Sub/ICmp/Select were always included in the scalar fshl cost
+; even for constant shifts, giving ScalarCost=24 for the bundle (net=-17),
+; which overwhelmed the extract cost (total=-10) and caused incorrect
+; vectorization.
+
declare i16 @llvm.fshl.i16(i16, i16, i16)
define void @foo(i16 %lx3, ptr %extra_bits, i16 %init_count) {
; CHECK-LABEL: define void @foo(
; CHECK-SAME: i16 [[LX3:%.*]], ptr [[EXTRA_BITS:%.*]], i16 [[INIT_COUNT:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[EB1_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 2
+; CHECK-NEXT: [[EB2_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 4
+; CHECK-NEXT: [[EB3_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[EXTRA_BITS]], i64 6
; CHECK-NEXT: br label %[[WHILE_BODY:.*]]
; CHECK: [[WHILE_BODY]]:
-; CHECK-NEXT: [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS:.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP3:%.*]], %[[USE_RESULTS]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 poison>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[LX3]], i32 3
-; CHECK-NEXT: [[TMP3]] = call <4 x i16> @llvm.fshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP2]], <4 x i16> splat (i16 1))
-; CHECK-NEXT: store <4 x i16> [[TMP3]], ptr [[EXTRA_BITS]], align 2
+; CHECK-NEXT: [[EB0:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP4:%.*]], %[[USE_RESULTS:.*]] ]
+; CHECK-NEXT: [[EB1:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP5:%.*]], %[[USE_RESULTS]] ]
+; CHECK-NEXT: [[EB2:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP6:%.*]], %[[USE_RESULTS]] ]
+; CHECK-NEXT: [[EB3:%.*]] = phi i16 [ 0, %[[ENTRY]] ], [ [[TMP7:%.*]], %[[USE_RESULTS]] ]
+; CHECK-NEXT: [[CTR:%.*]] = phi i16 [ [[INIT_COUNT]], %[[ENTRY]] ], [ [[CTR_DEC:%.*]], %[[USE_RESULTS]] ]
+; CHECK-NEXT: [[TMP7]] = tail call i16 @llvm.fshl.i16(i16 [[EB3]], i16 [[LX3]], i16 1)
+; CHECK-NEXT: store i16 [[TMP7]], ptr [[EB3_PTR]], align 2
+; CHECK-NEXT: [[TMP6]] = tail call i16 @llvm.fshl.i16(i16 [[EB2]], i16 [[EB3]], i16 1)
+; CHECK-NEXT: store i16 [[TMP6]], ptr [[EB2_PTR]], align 2
+; CHECK-NEXT: [[TMP5]] = tail call i16 @llvm.fshl.i16(i16 [[EB1]], i16 [[EB2]], i16 1)
+; CHECK-NEXT: store i16 [[TMP5]], ptr [[EB1_PTR]], align 2
+; CHECK-NEXT: [[TMP4]] = tail call i16 @llvm.fshl.i16(i16 [[EB0]], i16 [[EB1]], i16 1)
+; CHECK-NEXT: store i16 [[TMP4]], ptr [[EXTRA_BITS]], align 2
; CHECK-NEXT: br label %[[USE_RESULTS]]
; CHECK: [[USE_RESULTS]]:
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
; CHECK-NEXT: [[SUM01:%.*]] = add i16 [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i16> [[TMP3]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
; CHECK-NEXT: [[SUM23:%.*]] = sub i16 [[TMP6]], [[TMP7]]
; CHECK-NEXT: [[SUM:%.*]] = add i16 [[SUM01]], [[SUM23]]
; CHECK-NEXT: store i16 [[SUM]], ptr [[EXTRA_BITS]], align 2
More information about the llvm-commits
mailing list