[llvm] a640aa5 - [CostModel][X86] Add insertelement costs into a known base vector value

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 11 04:09:04 PDT 2022


Author: Simon Pilgrim
Date: 2022-10-11T12:07:25+01:00
New Revision: a640aa5bfd9452967bf5cdd804baea75e7f5a433

URL: https://github.com/llvm/llvm-project/commit/a640aa5bfd9452967bf5cdd804baea75e7f5a433
DIFF: https://github.com/llvm/llvm-project/commit/a640aa5bfd9452967bf5cdd804baea75e7f5a433.diff

LOG: [CostModel][X86] Add insertelement costs into a known base vector value

We were only testing inserting into undef/poison base vectors

Test coverage for Issue #58261

Added: 
    llvm/test/Analysis/CostModel/X86/vector-insert-value.ll

Modified: 
    

Removed: 
    


################################################################################
diff  --git a/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
new file mode 100644
index 0000000000000..1053c19641667
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/X86/vector-insert-value.ll
@@ -0,0 +1,1303 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx2 | FileCheck %s --check-prefixes=AVX
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
+;
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=SSE,SLM
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,GLM
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=AVX
+; Costs of inserting a double into <2 x double>, <4 x double> and <8 x double> argument vectors, at the variable index %arg and at fixed constant indices.
+define i32 @insert_double(i32 %arg, double %val, <2 x double> %src128, <4 x double> %src256, <8 x double> %src512) {
+; SSE-LABEL: 'insert_double'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_a = insertelement <2 x double> %src128, double %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64_0 = insertelement <2 x double> %src128, double %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> %src128, double %val, i32 1
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4f64_a = insertelement <4 x double> %src256, double %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> %src256, double %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_3 = insertelement <4 x double> %src256, double %val, i32 3
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8f64_a = insertelement <8 x double> %src512, double %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> %src512, double %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_3 = insertelement <8 x double> %src512, double %val, i32 3
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = insertelement <8 x double> %src512, double %val, i32 4
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f64_7 = insertelement <8 x double> %src512, double %val, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_double'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_a = insertelement <2 x double> %src128, double %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64_0 = insertelement <2 x double> %src128, double %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> %src128, double %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_a = insertelement <4 x double> %src256, double %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> %src256, double %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> %src256, double %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f64_a = insertelement <8 x double> %src512, double %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> %src512, double %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> %src512, double %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_4 = insertelement <8 x double> %src512, double %val, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> %src512, double %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_double'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_a = insertelement <2 x double> %src128, double %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f64_0 = insertelement <2 x double> %src128, double %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_1 = insertelement <2 x double> %src128, double %val, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_a = insertelement <4 x double> %src256, double %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f64_0 = insertelement <4 x double> %src256, double %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_3 = insertelement <4 x double> %src256, double %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_a = insertelement <8 x double> %src512, double %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f64_0 = insertelement <8 x double> %src512, double %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_3 = insertelement <8 x double> %src512, double %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_4 = insertelement <8 x double> %src512, double %val, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_7 = insertelement <8 x double> %src512, double %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %v2f64_a = insertelement <2 x double> %src128, double %val, i32 %arg
+  %v2f64_0 = insertelement <2 x double> %src128, double %val, i32 0
+  %v2f64_1 = insertelement <2 x double> %src128, double %val, i32 1
+  ; <4 x double> source: variable index, then constant indices 0 and 3 (first and last element).
+  %v4f64_a = insertelement <4 x double> %src256, double %val, i32 %arg
+  %v4f64_0 = insertelement <4 x double> %src256, double %val, i32 0
+  %v4f64_3 = insertelement <4 x double> %src256, double %val, i32 3
+  ; <8 x double> source: variable index, then constant indices 0, 3, 4 and 7.
+  %v8f64_a = insertelement <8 x double> %src512, double %val, i32 %arg
+  %v8f64_0 = insertelement <8 x double> %src512, double %val, i32 0
+  %v8f64_3 = insertelement <8 x double> %src512, double %val, i32 3
+  %v8f64_4 = insertelement <8 x double> %src512, double %val, i32 4
+  %v8f64_7 = insertelement <8 x double> %src512, double %val, i32 7
+  ; Returned value is undef; the assertions above match only the printed cost lines.
+  ret i32 undef
+}
+; Costs of inserting a float into <2 x float>, <4 x float>, <8 x float> and <16 x float> argument vectors, at the variable index %arg and at fixed constant indices.
+define i32 @insert_float(i32 %arg, float %val, <2 x float> %src64, <4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
+; SSE2-LABEL: 'insert_float'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_float'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_float'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_float'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_float'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_float'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_float'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_float'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16f32_a = insertelement <16 x float> %src512, float %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_0 = insertelement <16 x float> %src512, float %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_3 = insertelement <16 x float> %src512, float %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %v16f32_8 = insertelement <16 x float> %src512, float %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %v2f32_a = insertelement <2 x float> %src64, float %val, i32 %arg
+  %v2f32_0 = insertelement <2 x float> %src64, float %val, i32 0
+  %v2f32_1 = insertelement <2 x float> %src64, float %val, i32 1
+  ; <4 x float> source: variable index, then constant indices 0 and 3 (first and last element).
+  %v4f32_a = insertelement <4 x float> %src128, float %val, i32 %arg
+  %v4f32_0 = insertelement <4 x float> %src128, float %val, i32 0
+  %v4f32_3 = insertelement <4 x float> %src128, float %val, i32 3
+  ; <8 x float> source: variable index, then constant indices 0, 3, 4 and 7.
+  %v8f32_a = insertelement <8 x float> %src256, float %val, i32 %arg
+  %v8f32_0 = insertelement <8 x float> %src256, float %val, i32 0
+  %v8f32_3 = insertelement <8 x float> %src256, float %val, i32 3
+  %v8f32_4 = insertelement <8 x float> %src256, float %val, i32 4
+  %v8f32_7 = insertelement <8 x float> %src256, float %val, i32 7
+  ; <16 x float> source: variable index, then constant indices 0, 3, 8 and 15.
+  %v16f32_a  = insertelement <16 x float> %src512, float %val, i32 %arg
+  %v16f32_0  = insertelement <16 x float> %src512, float %val, i32 0
+  %v16f32_3  = insertelement <16 x float> %src512, float %val, i32 3
+  %v16f32_8  = insertelement <16 x float> %src512, float %val, i32 8
+  %v16f32_15 = insertelement <16 x float> %src512, float %val, i32 15
+  ; Returned value is undef; the assertions above match only the printed cost lines.
+  ret i32 undef
+}
+
+define i32 @insert_i64(i32 %arg, i64 %val, <2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) {
+; SSE2-LABEL: 'insert_i64'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i64'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i64'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i64'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i64'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i64'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i64'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i64'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %v2i64_a = insertelement <2 x i64> %src128, i64 %val, i32 %arg
+  %v2i64_0 = insertelement <2 x i64> %src128, i64 %val, i32 0
+  %v2i64_1 = insertelement <2 x i64> %src128, i64 %val, i32 1
+
+  %v4i64_a = insertelement <4 x i64> %src256, i64 %val, i32 %arg
+  %v4i64_0 = insertelement <4 x i64> %src256, i64 %val, i32 0
+  %v4i64_3 = insertelement <4 x i64> %src256, i64 %val, i32 3
+
+  %v8i64_a = insertelement <8 x i64> %src512, i64 %val, i32 %arg
+  %v8i64_0 = insertelement <8 x i64> %src512, i64 %val, i32 0
+  %v8i64_3 = insertelement <8 x i64> %src512, i64 %val, i32 3
+  %v8i64_4 = insertelement <8 x i64> %src512, i64 %val, i32 4
+  %v8i64_7 = insertelement <8 x i64> %src512, i64 %val, i32 7
+
+  ret i32 undef
+}
+
+define i32 @insert_i32(i32 %arg, i32 %val, <2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512) {
+; SSE2-LABEL: 'insert_i32'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i32'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i32'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i32'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i32'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i32'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i32'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i32'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v16i32_a = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_0 = insertelement <16 x i32> %src512, i32 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_3 = insertelement <16 x i32> %src512, i32 %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_8 = insertelement <16 x i32> %src512, i32 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+; Test body - inserts the scalar %val into each base vector argument (never undef/poison), once at the non-constant index %arg and then at selected constant indices; the results are unused because only the per-instruction cost lines above are checked.
+  %v2i32_a = insertelement <2 x i32> %src64, i32 %val, i32 %arg ; <2 x i32> group, non-constant index first
+  %v2i32_0 = insertelement <2 x i32> %src64, i32 %val, i32 0
+  %v2i32_1 = insertelement <2 x i32> %src64, i32 %val, i32 1
+  ; <4 x i32> group - non-constant index, then the first (0) and last (3) element.
+  %v4i32_a = insertelement <4 x i32> %src128, i32 %val, i32 %arg
+  %v4i32_0 = insertelement <4 x i32> %src128, i32 %val, i32 0
+  %v4i32_3 = insertelement <4 x i32> %src128, i32 %val, i32 3
+  ; <8 x i32> group - indices 0/3 and 4/7 are the first/last elements of the low and high 4-element halves. NOTE(review) - presumably chosen to probe the 128-bit boundary; confirm against the X86 cost tables.
+  %v8i32_a = insertelement <8 x i32> %src256, i32 %val, i32 %arg
+  %v8i32_0 = insertelement <8 x i32> %src256, i32 %val, i32 0
+  %v8i32_3 = insertelement <8 x i32> %src256, i32 %val, i32 3
+  %v8i32_4 = insertelement <8 x i32> %src256, i32 %val, i32 4
+  %v8i32_7 = insertelement <8 x i32> %src256, i32 %val, i32 7
+  ; <16 x i32> group - index 3 ends the first 4-element quarter, 8 starts the upper 8-element half, 15 is the last element.
+  %v16i32_a  = insertelement <16 x i32> %src512, i32 %val, i32 %arg
+  %v16i32_0  = insertelement <16 x i32> %src512, i32 %val, i32 0
+  %v16i32_3  = insertelement <16 x i32> %src512, i32 %val, i32 3
+  %v16i32_8  = insertelement <16 x i32> %src512, i32 %val, i32 8
+  %v16i32_15 = insertelement <16 x i32> %src512, i32 %val, i32 15
+  ; Placeholder return - none of the inserted vectors above is consumed.
+  ret i32 undef
+}
+
+define i32 @insert_i16(i32 %arg, i16 %val, <2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512) {
+; SSE-LABEL: 'insert_i16'
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_a = insertelement <2 x i16> %src32, i16 %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> %src32, i16 %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> %src32, i16 %val, i32 1
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_a = insertelement <4 x i16> %src64, i16 %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> %src64, i16 %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> %src64, i16 %val, i32 3
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16_a = insertelement <8 x i16> %src128, i16 %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> %src128, i16 %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> %src128, i16 %val, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v16i16_a = insertelement <16 x i16> %src256, i16 %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> %src256, i16 %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> %src256, i16 %val, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_8 = insertelement <16 x i16> %src256, i16 %val, i32 8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_15 = insertelement <16 x i16> %src256, i16 %val, i32 15
+; SSE-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i16_a = insertelement <32 x i16> %src512, i16 %val, i32 %arg
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> %src512, i16 %val, i32 0
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> %src512, i16 %val, i32 7
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_8 = insertelement <32 x i16> %src512, i16 %val, i32 8
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_15 = insertelement <32 x i16> %src512, i16 %val, i32 15
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> %src512, i16 %val, i32 16
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_24 = insertelement <32 x i16> %src512, i16 %val, i32 24
+; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_31 = insertelement <32 x i16> %src512, i16 %val, i32 31
+; SSE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i16'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_a = insertelement <2 x i16> %src32, i16 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> %src32, i16 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> %src32, i16 %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_a = insertelement <4 x i16> %src64, i16 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> %src64, i16 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> %src64, i16 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16_a = insertelement <8 x i16> %src128, i16 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> %src128, i16 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> %src128, i16 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_a = insertelement <16 x i16> %src256, i16 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> %src256, i16 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> %src256, i16 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> %src256, i16 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> %src256, i16 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i16_a = insertelement <32 x i16> %src512, i16 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> %src512, i16 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> %src512, i16 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> %src512, i16 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> %src512, i16 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_16 = insertelement <32 x i16> %src512, i16 %val, i32 16
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> %src512, i16 %val, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> %src512, i16 %val, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i16'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_a = insertelement <2 x i16> %src32, i16 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_0 = insertelement <2 x i16> %src32, i16 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_1 = insertelement <2 x i16> %src32, i16 %val, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_a = insertelement <4 x i16> %src64, i16 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = insertelement <4 x i16> %src64, i16 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_3 = insertelement <4 x i16> %src64, i16 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i16_a = insertelement <8 x i16> %src128, i16 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = insertelement <8 x i16> %src128, i16 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i16_7 = insertelement <8 x i16> %src128, i16 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_a = insertelement <16 x i16> %src256, i16 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = insertelement <16 x i16> %src256, i16 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i16_7 = insertelement <16 x i16> %src256, i16 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_8 = insertelement <16 x i16> %src256, i16 %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i16_15 = insertelement <16 x i16> %src256, i16 %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_a = insertelement <32 x i16> %src512, i16 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_0 = insertelement <32 x i16> %src512, i16 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i16_7 = insertelement <32 x i16> %src512, i16 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_8 = insertelement <32 x i16> %src512, i16 %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_15 = insertelement <32 x i16> %src512, i16 %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_16 = insertelement <32 x i16> %src512, i16 %val, i32 16
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_24 = insertelement <32 x i16> %src512, i16 %val, i32 24
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i16_31 = insertelement <32 x i16> %src512, i16 %val, i32 31
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %v2i16_a = insertelement <2 x i16> %src32, i16 %val, i32 %arg
+  %v2i16_0 = insertelement <2 x i16> %src32, i16 %val, i32 0
+  %v2i16_1 = insertelement <2 x i16> %src32, i16 %val, i32 1
+
+  %v4i16_a = insertelement <4 x i16> %src64, i16 %val, i32 %arg
+  %v4i16_0 = insertelement <4 x i16> %src64, i16 %val, i32 0
+  %v4i16_3 = insertelement <4 x i16> %src64, i16 %val, i32 3
+
+  %v8i16_a = insertelement <8 x i16> %src128, i16 %val, i32 %arg
+  %v8i16_0 = insertelement <8 x i16> %src128, i16 %val, i32 0
+  %v8i16_7 = insertelement <8 x i16> %src128, i16 %val, i32 7
+
+  %v16i16_a  = insertelement <16 x i16> %src256, i16 %val, i32 %arg
+  %v16i16_0  = insertelement <16 x i16> %src256, i16 %val, i32 0
+  %v16i16_7  = insertelement <16 x i16> %src256, i16 %val, i32 7
+  %v16i16_8  = insertelement <16 x i16> %src256, i16 %val, i32 8
+  %v16i16_15 = insertelement <16 x i16> %src256, i16 %val, i32 15
+
+  %v32i16_a  = insertelement <32 x i16> %src512, i16 %val, i32 %arg
+  %v32i16_0  = insertelement <32 x i16> %src512, i16 %val, i32 0
+  %v32i16_7  = insertelement <32 x i16> %src512, i16 %val, i32 7
+  %v32i16_8  = insertelement <32 x i16> %src512, i16 %val, i32 8
+  %v32i16_15 = insertelement <32 x i16> %src512, i16 %val, i32 15
+  %v32i16_16 = insertelement <32 x i16> %src512, i16 %val, i32 16
+  %v32i16_24 = insertelement <32 x i16> %src512, i16 %val, i32 24
+  %v32i16_31 = insertelement <32 x i16> %src512, i16 %val, i32 31
+
+  ret i32 undef
+}
+
+define i32 @insert_i8(i32 %arg, i8 %val, <2 x i8> %src16, <4 x i8> %src32, <8 x i8> %src64, <16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
+; SSE2-LABEL: 'insert_i8'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i8'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i8'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i8'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i8'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512-LABEL: 'insert_i8'
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; AVX512-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i8'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i8'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_a = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_0 = insertelement <2 x i8> %src16, i8 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i8_3 = insertelement <2 x i8> %src16, i8 %val, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_a = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = insertelement <4 x i8> %src32, i8 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i8_3 = insertelement <4 x i8> %src32, i8 %val, i32 3
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_a = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = insertelement <8 x i8> %src64, i8 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i8_7 = insertelement <8 x i8> %src64, i8 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i8_a = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = insertelement <16 x i8> %src128, i8 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_8 = insertelement <16 x i8> %src128, i8 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i8_a = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = insertelement <32 x i8> %src256, i8 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_7 = insertelement <32 x i8> %src256, i8 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_8 = insertelement <32 x i8> %src256, i8 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i8_a = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_0 = insertelement <64 x i8> %src512, i8 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_7 = insertelement <64 x i8> %src512, i8 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_8 = insertelement <64 x i8> %src512, i8 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %v2i8_a   = insertelement <2 x i8> %src16, i8 %val, i32 %arg
+  %v2i8_0   = insertelement <2 x i8> %src16, i8 %val, i32 0
+  %v2i8_3   = insertelement <2 x i8> %src16, i8 %val, i32 1
+
+  %v4i8_a   = insertelement <4 x i8> %src32, i8 %val, i32 %arg
+  %v4i8_0   = insertelement <4 x i8> %src32, i8 %val, i32 0
+  %v4i8_3   = insertelement <4 x i8> %src32, i8 %val, i32 3
+
+  %v8i8_a   = insertelement <8 x i8> %src64, i8 %val, i32 %arg
+  %v8i8_0   = insertelement <8 x i8> %src64, i8 %val, i32 0
+  %v8i8_7   = insertelement <8 x i8> %src64, i8 %val, i32 7
+
+  %v16i8_a  = insertelement <16 x i8> %src128, i8 %val, i32 %arg
+  %v16i8_0  = insertelement <16 x i8> %src128, i8 %val, i32 0
+  %v16i8_8  = insertelement <16 x i8> %src128, i8 %val, i32 8
+  %v16i8_15 = insertelement <16 x i8> %src128, i8 %val, i32 15
+
+  %v32i8_a  = insertelement <32 x i8> %src256, i8 %val, i32 %arg
+  %v32i8_0  = insertelement <32 x i8> %src256, i8 %val, i32 0
+  %v32i8_7  = insertelement <32 x i8> %src256, i8 %val, i32 7
+  %v32i8_8  = insertelement <32 x i8> %src256, i8 %val, i32 8
+  %v32i8_15 = insertelement <32 x i8> %src256, i8 %val, i32 15
+  %v32i8_24 = insertelement <32 x i8> %src256, i8 %val, i32 24
+  %v32i8_31 = insertelement <32 x i8> %src256, i8 %val, i32 31
+
+  %v64i8_a  = insertelement <64 x i8> %src512, i8 %val, i32 %arg
+  %v64i8_0  = insertelement <64 x i8> %src512, i8 %val, i32 0
+  %v64i8_7  = insertelement <64 x i8> %src512, i8 %val, i32 7
+  %v64i8_8  = insertelement <64 x i8> %src512, i8 %val, i32 8
+  %v64i8_15 = insertelement <64 x i8> %src512, i8 %val, i32 15
+  %v64i8_24 = insertelement <64 x i8> %src512, i8 %val, i32 24
+  %v64i8_31 = insertelement <64 x i8> %src512, i8 %val, i32 31
+  %v64i8_32 = insertelement <64 x i8> %src512, i8 %val, i32 32
+  %v64i8_48 = insertelement <64 x i8> %src512, i8 %val, i32 48
+  %v64i8_63 = insertelement <64 x i8> %src512, i8 %val, i32 63
+
+  ret i32 undef
+}
+
+define i32 @insert_i1(i32 %arg, i1 %val, <2 x i1> %src2, <4 x i1> %src4, <8 x i1> %src8, <16 x i1> %src16, <32 x i1> %src32, <64 x i1> %src64) {
+; SSE2-LABEL: 'insert_i1'
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE3-LABEL: 'insert_i1'
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; SSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSSE3-LABEL: 'insert_i1'
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SSE4-LABEL: 'insert_i1'
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; SSE4-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX-LABEL: 'insert_i1'
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; AVX-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; AVX-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; AVX-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512F-LABEL: 'insert_i1'
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; AVX512BW-LABEL: 'insert_i1'
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; SLM-LABEL: 'insert_i1'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; GLM-LABEL: 'insert_i1'
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_a = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_0 = insertelement <2 x i1> %src2, i1 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i1_1 = insertelement <2 x i1> %src2, i1 %val, i32 1
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_a = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_0 = insertelement <4 x i1> %src4, i1 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i1_2 = insertelement <4 x i1> %src4, i1 %val, i32 2
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i1_a = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_0 = insertelement <8 x i1> %src8, i1 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8i1_4 = insertelement <8 x i1> %src8, i1 %val, i32 4
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16i1_a = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_0 = insertelement <16 x i1> %src16, i1 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_8 = insertelement <16 x i1> %src16, i1 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32i1_a = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_0 = insertelement <32 x i1> %src32, i1 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_7 = insertelement <32 x i1> %src32, i1 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_8 = insertelement <32 x i1> %src32, i1 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v64i1_a = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_0 = insertelement <64 x i1> %src64, i1 %val, i32 0
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_7 = insertelement <64 x i1> %src64, i1 %val, i32 7
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_8 = insertelement <64 x i1> %src64, i1 %val, i32 8
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+; GLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+; GLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+  %v2i1_a  = insertelement <2 x i1> %src2, i1 %val, i32 %arg
+  %v2i1_0  = insertelement <2 x i1> %src2, i1 %val, i32 0
+  %v2i1_1  = insertelement <2 x i1> %src2, i1 %val, i32 1
+
+  %v4i1_a  = insertelement <4 x i1> %src4, i1 %val, i32 %arg
+  %v4i1_0  = insertelement <4 x i1> %src4, i1 %val, i32 0
+  %v4i1_2  = insertelement <4 x i1> %src4, i1 %val, i32 2
+
+  %v8i1_a  = insertelement <8 x i1> %src8, i1 %val, i32 %arg
+  %v8i1_0  = insertelement <8 x i1> %src8, i1 %val, i32 0
+  %v8i1_4  = insertelement <8 x i1> %src8, i1 %val, i32 4
+
+  %v16i1_a  = insertelement <16 x i1> %src16, i1 %val, i32 %arg
+  %v16i1_0  = insertelement <16 x i1> %src16, i1 %val, i32 0
+  %v16i1_8  = insertelement <16 x i1> %src16, i1 %val, i32 8
+  %v16i1_15 = insertelement <16 x i1> %src16, i1 %val, i32 15
+
+  %v32i1_a  = insertelement <32 x i1> %src32, i1 %val, i32 %arg
+  %v32i1_0  = insertelement <32 x i1> %src32, i1 %val, i32 0
+  %v32i1_7  = insertelement <32 x i1> %src32, i1 %val, i32 7
+  %v32i1_8  = insertelement <32 x i1> %src32, i1 %val, i32 8
+  %v32i1_15 = insertelement <32 x i1> %src32, i1 %val, i32 15
+  %v32i1_24 = insertelement <32 x i1> %src32, i1 %val, i32 24
+  %v32i1_31 = insertelement <32 x i1> %src32, i1 %val, i32 31
+
+  %v64i1_a  = insertelement <64 x i1> %src64, i1 %val, i32 %arg
+  %v64i1_0  = insertelement <64 x i1> %src64, i1 %val, i32 0
+  %v64i1_7  = insertelement <64 x i1> %src64, i1 %val, i32 7
+  %v64i1_8  = insertelement <64 x i1> %src64, i1 %val, i32 8
+  %v64i1_15 = insertelement <64 x i1> %src64, i1 %val, i32 15
+  %v64i1_24 = insertelement <64 x i1> %src64, i1 %val, i32 24
+  %v64i1_31 = insertelement <64 x i1> %src64, i1 %val, i32 31
+  %v64i1_32 = insertelement <64 x i1> %src64, i1 %val, i32 32
+  %v64i1_48 = insertelement <64 x i1> %src64, i1 %val, i32 48
+  %v64i1_63 = insertelement <64 x i1> %src64, i1 %val, i32 63
+
+  ret i32 undef
+}


        


More information about the llvm-commits mailing list