[llvm] c7bb5ac - [NFC] Renamed /test/Analysis/CostModel/X86/splat-load.ll test and added more checks.
Vasileios Porpodas via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 27 09:48:08 PDT 2022
Author: Vasileios Porpodas
Date: 2022-04-27T09:47:43-07:00
New Revision: c7bb5ac5ca1b7eeb4b8e00a4f59e90520a44e5f7
URL: https://github.com/llvm/llvm-project/commit/c7bb5ac5ca1b7eeb4b8e00a4f59e90520a44e5f7
DIFF: https://github.com/llvm/llvm-project/commit/c7bb5ac5ca1b7eeb4b8e00a4f59e90520a44e5f7.diff
LOG: [NFC] Renamed /test/Analysis/CostModel/X86/splat-load.ll test and added more checks.
Renamed test/Analysis/CostModel/X86/splat-load.ll to shuffle-load.ll
to align it with AArch64's similar test.
Also added a complete list of checks for all vector combinations up to 512-bits.
Differential Revision: https://reviews.llvm.org/D124528
Added:
llvm/test/Analysis/CostModel/X86/shuffle-load.ll
Modified:
Removed:
llvm/test/Analysis/CostModel/X86/splat-load.ll
################################################################################
diff --git a/llvm/test/Analysis/CostModel/X86/shuffle-load.ll b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
new file mode 100644
index 000000000000..59c371dbdaa8
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/shuffle-load.ll
@@ -0,0 +1,473 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s -check-prefixes=SSE
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s -check-prefixes=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s -check-prefixes=SSE3
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s -check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s -check-prefixes=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s -check-prefixes=AVX512
+
+; This test checks that the cost of a splat-load shuffle is correctly detected.
+; If there is a combined load+broadcast instruction, like `movddup` it should
+; return 0.
+;
+; TODO: AVX `vbroadcast*` seems to support more types than the
+; 2xdouble type of `movddup`:
+; - `vbroadcastss` supports 4xfloat, 8xfloat
+; - `vbroadcastsd` supports 4xdouble
+
+; NOTE: The code in this test is a hack. Since TTI cannot currently detect a
+; proper broadcast pattern from a scalar load (like the one that follows),
+; we use a vector load as the shuffle's operand to trigger the pattern.
+;
+; %load = load double, double *%ptr
+; %insert = insertelement <2 x double> poison, double %load, i32 0
+; %bcast = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
+
+define void @shuffle_load() {
+; SSE-LABEL: 'shuffle_load'
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi8 = load <2 x i8>, ptr undef, align 2
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi8 = load <4 x i8>, ptr undef, align 4
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xi32 = load <8 x i32>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xi32 = load <16 x i32>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi64 = load <2 x i64>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xi64 = load <4 x i64>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xi64 = load <8 x i64>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_2xf16 = load <2 x half>, ptr undef, align 4
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xf32 = load <8 x float>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xf32 = load <16 x float>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xf64 = load <4 x double>, ptr undef, align 32
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xf64 = load <8 x double>, ptr undef, align 64
+; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE2-LABEL: 'shuffle_load'
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi8 = load <2 x i8>, ptr undef, align 2
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi8 = load <4 x i8>, ptr undef, align 4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xi32 = load <8 x i32>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xi32 = load <16 x i32>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi64 = load <2 x i64>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xi64 = load <4 x i64>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xi64 = load <8 x i64>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_2xf16 = load <2 x half>, ptr undef, align 4
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xf32 = load <8 x float>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xf32 = load <16 x float>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xf64 = load <4 x double>, ptr undef, align 32
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xf64 = load <8 x double>, ptr undef, align 64
+; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SSE3-LABEL: 'shuffle_load'
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi8 = load <2 x i8>, ptr undef, align 2
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi8 = load <4 x i8>, ptr undef, align 4
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xi32 = load <8 x i32>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xi32 = load <16 x i32>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi64 = load <2 x i64>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xi64 = load <4 x i64>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xi64 = load <8 x i64>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_2xf16 = load <2 x half>, ptr undef, align 4
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xf32 = load <8 x float>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xf32 = load <16 x float>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xf64 = load <4 x double>, ptr undef, align 32
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xf64 = load <8 x double>, ptr undef, align 64
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX-LABEL: 'shuffle_load'
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi8 = load <2 x i8>, ptr undef, align 2
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi8 = load <4 x i8>, ptr undef, align 4
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xi32 = load <8 x i32>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xi32 = load <16 x i32>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi64 = load <2 x i64>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xi64 = load <4 x i64>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xi64 = load <8 x i64>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_2xf16 = load <2 x half>, ptr undef, align 4
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xf32 = load <8 x float>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xf32 = load <16 x float>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xf64 = load <4 x double>, ptr undef, align 32
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xf64 = load <8 x double>, ptr undef, align 64
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX2-LABEL: 'shuffle_load'
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi8 = load <2 x i8>, ptr undef, align 2
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi8 = load <4 x i8>, ptr undef, align 4
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xi32 = load <8 x i32>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xi32 = load <16 x i32>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi64 = load <2 x i64>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xi64 = load <4 x i64>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xi64 = load <8 x i64>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_2xf16 = load <2 x half>, ptr undef, align 4
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xf32 = load <8 x float>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xf32 = load <16 x float>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xf64 = load <4 x double>, ptr undef, align 32
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xf64 = load <8 x double>, ptr undef, align 64
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512-LABEL: 'shuffle_load'
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi8 = load <2 x i8>, ptr undef, align 2
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi8 = load <4 x i8>, ptr undef, align 4
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi8 = load <8 x i8>, ptr undef, align 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_16xi8 = load <16 x i8>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_32xi8 = load <32 x i8>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_64xi8 = load <64 x i8>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi16 = load <2 x i16>, ptr undef, align 4
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi16 = load <4 x i16>, ptr undef, align 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_8xi16 = load <8 x i16>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_16xi16 = load <16 x i16>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_32xi16 = load <32 x i16>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi32 = load <2 x i32>, ptr undef, align 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xi32 = load <4 x i32>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xi32 = load <8 x i32>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xi32 = load <16 x i32>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xi64 = load <2 x i64>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xi64 = load <4 x i64>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xi64 = load <8 x i64>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_2xf16 = load <2 x half>, ptr undef, align 4
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_4xf16 = load <4 x half>, ptr undef, align 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ld_8xf16 = load <8 x half>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %ld_16xf16 = load <16 x half>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %ld_32xf16 = load <32 x half>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf32 = load <2 x float>, ptr undef, align 8
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_4xf32 = load <4 x float>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_8xf32 = load <8 x float>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_16xf32 = load <16 x float>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ld_2xf64 = load <2 x double>, ptr undef, align 16
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ld_4xf64 = load <4 x double>, ptr undef, align 32
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ld_8xf64 = load <8 x double>, ptr undef, align 64
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+ %ld_2xi8 = load <2 x i8>, ptr undef
+ %sf_2xi8 = shufflevector <2 x i8> %ld_2xi8, <2 x i8> undef, <2 x i32> zeroinitializer
+ %ld_4xi8 = load <4 x i8>, ptr undef
+ %sf_4xi8 = shufflevector <4 x i8> %ld_4xi8, <4 x i8> undef, <4 x i32> zeroinitializer
+ %ld_8xi8 = load <8 x i8>, ptr undef
+ %sf_8xi8 = shufflevector <8 x i8> %ld_8xi8, <8 x i8> undef, <8 x i32> zeroinitializer
+ %ld_16xi8 = load <16 x i8>, ptr undef
+ %sf_16xi8 = shufflevector <16 x i8> %ld_16xi8, <16 x i8> undef, <16 x i32> zeroinitializer
+ %ld_32xi8 = load <32 x i8>, ptr undef
+ %sf_32xi8 = shufflevector <32 x i8> %ld_32xi8, <32 x i8> undef, <32 x i32> zeroinitializer
+ %ld_64xi8 = load <64 x i8>, ptr undef
+ %sf_64xi8 = shufflevector <64 x i8> %ld_64xi8, <64 x i8> undef, <64 x i32> zeroinitializer
+
+ %ld_2xi16 = load <2 x i16>, ptr undef
+ %sf_2xi16 = shufflevector <2 x i16> %ld_2xi16, <2 x i16> undef, <2 x i32> zeroinitializer
+ %ld_4xi16 = load <4 x i16>, ptr undef
+ %sf_4xi16 = shufflevector <4 x i16> %ld_4xi16, <4 x i16> undef, <4 x i32> zeroinitializer
+ %ld_8xi16 = load <8 x i16>, ptr undef
+ %sf_8xi16 = shufflevector <8 x i16> %ld_8xi16, <8 x i16> undef, <8 x i32> zeroinitializer
+ %ld_16xi16 = load <16 x i16>, ptr undef
+ %sf_16xi16 = shufflevector <16 x i16> %ld_16xi16, <16 x i16> undef, <16 x i32> zeroinitializer
+ %ld_32xi16 = load <32 x i16>, ptr undef
+ %sf_32xi16 = shufflevector <32 x i16> %ld_32xi16, <32 x i16> undef, <32 x i32> zeroinitializer
+
+ %ld_2xi32 = load <2 x i32>, ptr undef
+ %sf_2xi32 = shufflevector <2 x i32> %ld_2xi32, <2 x i32> undef, <2 x i32> zeroinitializer
+ %ld_4xi32 = load <4 x i32>, ptr undef
+ %sf_4xi32 = shufflevector <4 x i32> %ld_4xi32, <4 x i32> undef, <4 x i32> zeroinitializer
+ %ld_8xi32 = load <8 x i32>, ptr undef
+ %sf_8xi32 = shufflevector <8 x i32> %ld_8xi32, <8 x i32> undef, <8 x i32> zeroinitializer
+ %ld_16xi32 = load <16 x i32>, ptr undef
+ %sf_16xi32 = shufflevector <16 x i32> %ld_16xi32, <16 x i32> undef, <16 x i32> zeroinitializer
+
+ %ld_2xi64 = load <2 x i64>, ptr undef
+ %sf_2xi64 = shufflevector <2 x i64> %ld_2xi64, <2 x i64> undef, <2 x i32> zeroinitializer
+ %ld_4xi64 = load <4 x i64>, ptr undef
+ %sf_4xi64 = shufflevector <4 x i64> %ld_4xi64, <4 x i64> undef, <4 x i32> zeroinitializer
+ %ld_8xi64 = load <8 x i64>, ptr undef
+ %sf_8xi64 = shufflevector <8 x i64> %ld_8xi64, <8 x i64> undef, <8 x i32> zeroinitializer
+
+ %ld_2xf16 = load <2 x half>, ptr undef
+ %sf_2xf16 = shufflevector <2 x half> %ld_2xf16, <2 x half> undef, <2 x i32> zeroinitializer
+ %ld_4xf16 = load <4 x half>, ptr undef
+ %sf_4xf16 = shufflevector <4 x half> %ld_4xf16, <4 x half> undef, <4 x i32> zeroinitializer
+ %ld_8xf16 = load <8 x half>, ptr undef
+ %sf_8xf16 = shufflevector <8 x half> %ld_8xf16, <8 x half> undef, <8 x i32> zeroinitializer
+ %ld_16xf16 = load <16 x half>, ptr undef
+ %sf_16xf16 = shufflevector <16 x half> %ld_16xf16, <16 x half> undef, <16 x i32> zeroinitializer
+ %ld_32xf16 = load <32 x half>, ptr undef
+ %sf_32xf16 = shufflevector <32 x half> %ld_32xf16, <32 x half> undef, <32 x i32> zeroinitializer
+
+ %ld_2xf32 = load <2 x float>, ptr undef
+ %sf_2xf32 = shufflevector <2 x float> %ld_2xf32, <2 x float> undef, <2 x i32> zeroinitializer
+ %ld_4xf32 = load <4 x float>, ptr undef
+ %sf_4xf32 = shufflevector <4 x float> %ld_4xf32, <4 x float> undef, <4 x i32> zeroinitializer
+ %ld_8xf32 = load <8 x float>, ptr undef
+ %sf_8xf32 = shufflevector <8 x float> %ld_8xf32, <8 x float> undef, <8 x i32> zeroinitializer
+ %ld_16xf32 = load <16 x float>, ptr undef
+ %sf_16xf32 = shufflevector <16 x float> %ld_16xf32, <16 x float> undef, <16 x i32> zeroinitializer
+
+ %ld_2xf64 = load <2 x double>, ptr undef
+ %sf_2xf64 = shufflevector <2 x double> %ld_2xf64, <2 x double> undef, <2 x i32> zeroinitializer
+ %ld_4xf64 = load <4 x double>, ptr undef
+ %sf_4xf64 = shufflevector <4 x double> %ld_4xf64, <4 x double> undef, <4 x i32> zeroinitializer
+ %ld_8xf64 = load <8 x double>, ptr undef
+ %sf_8xf64 = shufflevector <8 x double> %ld_8xf64, <8 x double> undef, <8 x i32> zeroinitializer
+
+ ret void
+}
diff --git a/llvm/test/Analysis/CostModel/X86/splat-load.ll b/llvm/test/Analysis/CostModel/X86/splat-load.ll
deleted file mode 100644
index a0a57579c421..000000000000
--- a/llvm/test/Analysis/CostModel/X86/splat-load.ll
+++ /dev/null
@@ -1,51 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s -check-prefixes=SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s -check-prefixes=SSE3
-
-; This test checks that the cost of a splat-load shuffle is correctly detected
-; as 0, because the combined load + broadcast is lowered to a `movddup` instr.
-;
-; TODO: AVX `vbroadcast*` seems to support more types than the
-; 2xdouble type of `movddup`:
-; - `vbroadcastss` supports 4xfloat, 8xfloat
-; - `vbroadcastsd` supports 4xdouble
-
-; NOTE: The code in this test is a hack. Since TTI cannot currently detect a
-; proper broadcast pattern from a scalar load (like the one that follows),
-; we use a vector load as the shuffle's operand to trigger the pattern.
-;
-; %load = load double, double *%ptr
-; %insert = insertelement <2 x double> poison, double %load, i32 0
-; %bcast = shufflevector <2 x double> %insert, <2 x double> poison, <2 x i32> zeroinitializer
-
-define void @splat_load_2xdouble(<2 x double> *%ptr) {
-; SSE2-LABEL: 'splat_load_2xdouble'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, <2 x double>* %ptr, align 16
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat_load = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSE3-LABEL: 'splat_load_2xdouble'
-; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x double>, <2 x double>* %ptr, align 16
-; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %splat_load = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
-; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
- %load = load <2 x double>, <2 x double> *%ptr
- %splat_load = shufflevector <2 x double> %load, <2 x double> poison, <2 x i32> zeroinitializer
- ret void
-}
-
-define void @splat_load_2xfloat(<2 x float> *%ptr) {
-; SSE2-LABEL: 'splat_load_2xfloat'
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x float>, <2 x float>* %ptr, align 8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat_load = shufflevector <2 x float> %load, <2 x float> poison, <2 x i32> zeroinitializer
-; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
-; SSE3-LABEL: 'splat_load_2xfloat'
-; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %load = load <2 x float>, <2 x float>* %ptr, align 8
-; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat_load = shufflevector <2 x float> %load, <2 x float> poison, <2 x i32> zeroinitializer
-; SSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
-;
- %load = load <2 x float>, <2 x float> *%ptr
- %splat_load = shufflevector <2 x float> %load, <2 x float> poison, <2 x i32> zeroinitializer
- ret void
-}
More information about the llvm-commits
mailing list