[llvm] 064cd2e - [x86] allow peeking through an extract_subvector to find a splatted operand
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 11:03:28 PST 2020
Author: Sanjay Patel
Date: 2020-02-20T13:59:59-05:00
New Revision: 064cd2ecdb3d0c52be5b6cf4fc67125baa714d3a
URL: https://github.com/llvm/llvm-project/commit/064cd2ecdb3d0c52be5b6cf4fc67125baa714d3a
DIFF: https://github.com/llvm/llvm-project/commit/064cd2ecdb3d0c52be5b6cf4fc67125baa714d3a.diff
LOG: [x86] allow peeking through an extract_subvector to find a splatted operand
The motivating case is seen in "splat4_v8f32_load_store" and is based on code in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024
(I haven't stepped through the v8i32 sibling test yet to see why that diverged.)
Other potential improvements are also visible, such as allowing scalarization or
vector narrowing.
Differential Revision: https://reviews.llvm.org/D74909
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/avx-splat.ll
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
llvm/test/CodeGen/X86/extractelement-load.ll
llvm/test/CodeGen/X86/fma.ll
llvm/test/CodeGen/X86/insertelement-var-index.ll
llvm/test/CodeGen/X86/masked_gather.ll
llvm/test/CodeGen/X86/pr34653.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 981d27969a95..8dfb94b1328f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12867,6 +12867,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ // TODO: Combine this logic with findEltLoadSrc() used by
+ // EltsFromConsecutiveLoads().
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
@@ -12882,6 +12884,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
BitOffset %= OpBitWidth;
continue;
}
+ case ISD::EXTRACT_SUBVECTOR: {
+ auto *ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ if (!ConstantIdx)
+ break;
+
+ // The extraction index adds to the existing offset.
+ unsigned EltBitWidth = V.getScalarValueSizeInBits();
+ unsigned Idx = ConstantIdx->getZExtValue();
+ unsigned BeginOffset = Idx * EltBitWidth;
+ BitOffset += BeginOffset;
+ V = V.getOperand(0);
+ continue;
+ }
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 1e6d0e8766b3..746b068cec2f 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1965,7 +1965,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X64-LABEL: test_mm256_set1_epi64x:
; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 3751fccb09d4..26fefb26fff4 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -34,7 +34,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; X64-LABEL: funcC:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovq %rdi, %xmm0
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 198694a30143..8bb063a8738d 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -2161,11 +2161,11 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,3,4]
-; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm4
+; CHECK-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [2,0,3,4,2,0,3,4]
+; CHECK-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm0
; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -2176,11 +2176,10 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,3,4]
; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermi2q %ymm3, %ymm0, %ymm2 {%k1} {z}
-; CHECK-NEXT: vmovdqa %ymm2, %ymm0
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 2, i32 0, i32 3, i32 4>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
@@ -4351,17 +4350,13 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(<8 x doub
ret <2 x double> %res
}
-; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [1,4,1,4,1,4,1,4]
-; CHECK-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm2
+; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1}
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4370,16 +4365,13 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double
ret <2 x double> %res
}
-; TODO - we'd be better off splitting the load to 2*xmm and performing a VSHUFPD.
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(<8 x double>* %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} xmm1 = [1,4]
+; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
; CHECK-NEXT: retq
%vec = load <8 x double>, <8 x double>* %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index aa71d3749cae..0608d1809ed6 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -204,8 +204,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
;
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -431,8 +432,9 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index af1abe71e2f2..9eec82cd6652 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -260,8 +260,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
;
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -554,8 +555,9 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 6b5dd7ee3f10..332fea81adff 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -161,7 +161,7 @@ define float @t6(<8 x float> *%a0) {
;
; X64-AVX-LABEL: t6:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
+; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
@@ -204,7 +204,7 @@ define void @PR43971(<8 x float> *%a0, float *%a1) {
;
; X64-AVX-LABEL: PR43971:
; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vpermilpd $1, 16(%rdi), %xmm0 # xmm0 = mem[1,0]
+; X64-AVX-NEXT: vmovss 24(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpltss %xmm0, %xmm1, %xmm1
; X64-AVX-NEXT: vmovss (%rsi), %xmm2 # xmm2 = mem[0],zero,zero,zero
@@ -252,7 +252,7 @@ define float @PR43971_1(<8 x float> *%a0) nounwind {
;
; X64-AVX-LABEL: PR43971_1:
; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: vmovshdup (%rdi), %xmm0 # xmm0 = mem[1,1,3,3]
+; X64-AVX-NEXT: vmovss 4(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1
; X64-AVX-NEXT: vmovss {{.*}}(%rip), %xmm2 # xmm2 = mem[0],zero,zero,zero
diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll
index 78ee863d19f9..a687bfd43fa6 100644
--- a/llvm/test/CodeGen/X86/fma.ll
+++ b/llvm/test/CodeGen/X86/fma.ll
@@ -1836,7 +1836,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: ## imm = 0x160
; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
+; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x38]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01]
@@ -1846,7 +1847,7 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x8c,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x94,0x24,0x30,0x01,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm1, %xmm0 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc8,0x01]
; FMACALL32_BDVER2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 16-byte Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x29,0x84,0x24,0x40,0x01,0x00,0x00]
@@ -1856,10 +1857,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovsd 48(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x30]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@@ -1869,10 +1871,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 40(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: vmovsd 40(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x28]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xa0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@@ -1882,10 +1885,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
+; FMACALL32_BDVER2-NEXT: vmovsd 32(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x20]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0x00,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vextractf128 $1, %ymm0, %xmm1 ## encoding: [0xc4,0xe3,0x7d,0x19,0xc1,0x01]
@@ -1902,10 +1906,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 24(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x18]
+; FMACALL32_BDVER2-NEXT: vmovsd 24(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x18]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00]
-; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x44,0x24,0x30]
; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@@ -1914,10 +1919,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vmovups %xmm0, (%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovsd 16(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x10]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30]
-; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpckhpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@@ -1927,10 +1933,11 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 8(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: vmovsd 8(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x08]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill
; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20]
-; FMACALL32_BDVER2-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x13,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm0 ## 32-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x28,0x84,0x24,0xe0,0x00,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vunpcklpd {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
@@ -1940,8 +1947,9 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %
; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77]
; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A]
; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4
-; FMACALL32_BDVER2-NEXT: vmovaps 56(%ebp), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x45,0x38]
-; FMACALL32_BDVER2-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xf8,0x17,0x44,0x24,0x10]
+; FMACALL32_BDVER2-NEXT: vmovsd 64(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x40]
+; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero
+; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10]
; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload
; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00]
; FMACALL32_BDVER2-NEXT: vmovlps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0, %xmm0 ## 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index a37fe63944de..564c789c9880 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -376,7 +376,7 @@ define <4 x i64> @arg_i64_v4i64(i64 %x, i32 %y) nounwind {
; AVX1-LABEL: arg_i64_v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 138b33d55b6a..44dcf90c07b5 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1244,57 +1244,77 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
;
; AVX1-LABEL: gather_v8i32_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: movl $c, %eax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
-; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm3, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
+; AVX1-NEXT: movl $c, %ecx
+; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; AVX1-NEXT: vpaddq {{.*}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm9, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm9, %xmm4
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: # implicit-def: $ymm1
; AVX1-NEXT: je .LBB4_2
; AVX1-NEXT: # %bb.1: # %cond.load
-; AVX1-NEXT: vmovq %xmm4, %rcx
-; AVX1-NEXT: vmovd (%rcx), %xmm1 # xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovq %xmm3, %rdx
+; AVX1-NEXT: vmovd (%rdx), %xmm1 # xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: .LBB4_2: # %else
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_4
; AVX1-NEXT: # %bb.3: # %cond.load1
-; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm1, %xmm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX1-NEXT: vpinsrd $1, (%rdx), %xmm1, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: .LBB4_4: # %else2
; AVX1-NEXT: testb $4, %al
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: jne .LBB4_5
-; AVX1-NEXT: # %bb.6: # %else5
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: je .LBB4_6
+; AVX1-NEXT: # %bb.5: # %cond.load4
+; AVX1-NEXT: vmovq %xmm4, %rdx
+; AVX1-NEXT: vpinsrd $2, (%rdx), %xmm1, %xmm5
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: .LBB4_6: # %else5
+; AVX1-NEXT: vmovq %rcx, %xmm5
; AVX1-NEXT: testb $8, %al
-; AVX1-NEXT: jne .LBB4_7
+; AVX1-NEXT: je .LBB4_8
+; AVX1-NEXT: # %bb.7: # %cond.load7
+; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX1-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm6
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: .LBB4_8: # %else8
+; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = xmm5[0,0]
; AVX1-NEXT: testb $16, %al
-; AVX1-NEXT: jne .LBB4_9
+; AVX1-NEXT: je .LBB4_10
+; AVX1-NEXT: # %bb.9: # %cond.load10
+; AVX1-NEXT: vmovq %xmm3, %rcx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpinsrd $0, (%rcx), %xmm6, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX1-NEXT: .LBB4_10: # %else11
; AVX1-NEXT: testb $32, %al
-; AVX1-NEXT: jne .LBB4_11
+; AVX1-NEXT: je .LBB4_12
+; AVX1-NEXT: # %bb.11: # %cond.load13
+; AVX1-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: .LBB4_12: # %else14
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_14
-; AVX1-NEXT: .LBB4_13: # %cond.load16
-; AVX1-NEXT: vmovq %xmm6, %rcx
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX1-NEXT: # %bb.13: # %cond.load16
+; AVX1-NEXT: vmovq %xmm4, %rcx
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: .LBB4_14: # %else17
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm5
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_16
; AVX1-NEXT: # %bb.15: # %cond.load19
-; AVX1-NEXT: vpextrq $1, %xmm6, %rax
+; AVX1-NEXT: vpextrq $1, %xmm4, %rax
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpinsrd $3, (%rax), %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
@@ -1455,33 +1475,6 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB4_5: # %cond.load4
-; AVX1-NEXT: vmovq %xmm6, %rcx
-; AVX1-NEXT: vpinsrd $2, (%rcx), %xmm1, %xmm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: testb $8, %al
-; AVX1-NEXT: je .LBB4_8
-; AVX1-NEXT: .LBB4_7: # %cond.load7
-; AVX1-NEXT: vpextrq $1, %xmm6, %rcx
-; AVX1-NEXT: vpinsrd $3, (%rcx), %xmm1, %xmm5
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT: testb $16, %al
-; AVX1-NEXT: je .LBB4_10
-; AVX1-NEXT: .LBB4_9: # %cond.load10
-; AVX1-NEXT: vmovq %xmm4, %rcx
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpinsrd $0, (%rcx), %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
-; AVX1-NEXT: testb $32, %al
-; AVX1-NEXT: je .LBB4_12
-; AVX1-NEXT: .LBB4_11: # %cond.load13
-; AVX1-NEXT: vpextrq $1, %xmm4, %rcx
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpinsrd $1, (%rcx), %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: testb $64, %al
-; AVX1-NEXT: jne .LBB4_13
-; AVX1-NEXT: jmp .LBB4_14
;
; AVX2-LABEL: gather_v8i32_v8i32:
; AVX2: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/pr34653.ll b/llvm/test/CodeGen/X86/pr34653.ll
index 72843a5052be..9a0b56a90cb7 100644
--- a/llvm/test/CodeGen/X86/pr34653.ll
+++ b/llvm/test/CodeGen/X86/pr34653.ll
@@ -17,116 +17,48 @@ define void @pr34653() {
; CHECK-NEXT: callq test
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm1
-; CHECK-NEXT: vmovaps %xmm1, %xmm2
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm3
-; CHECK-NEXT: vmovaps %xmm3, %xmm4
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm5
-; CHECK-NEXT: vmovaps %xmm5, %xmm6
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm7
-; CHECK-NEXT: vmovaps %xmm7, %xmm8
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm9
-; CHECK-NEXT: vmovaps %xmm9, %xmm10
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm11
-; CHECK-NEXT: vmovaps %xmm11, %xmm12
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm13
-; CHECK-NEXT: vmovaps %xmm13, %xmm14
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm15
-; CHECK-NEXT: vmovaps %zmm15, %zmm16
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm17
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm18
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm19
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm20
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm21
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm22
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: vmovaps %zmm0, %zmm23
-; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT: vmovsd {{.*#+}} xmm16 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm17 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm18 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm19 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm20 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm21 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm22 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm23 = mem[0],zero
; CHECK-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload
-; CHECK-NEXT: # xmm25 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload
-; CHECK-NEXT: # xmm26 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 8-byte Reload
-; CHECK-NEXT: # xmm27 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 8-byte Reload
-; CHECK-NEXT: # xmm28 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 8-byte Reload
-; CHECK-NEXT: # xmm29 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 8-byte Reload
-; CHECK-NEXT: # xmm30 = mem[0],zero
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 8-byte Reload
-; CHECK-NEXT: # xmm31 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm25 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm26 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm27 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm28 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm29 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm30 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm31 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
-; CHECK-NEXT: # xmm0 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
; CHECK-NEXT: # xmm0 = mem[0],zero
@@ -142,7 +74,6 @@ define void @pr34653() {
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
-; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
entry:
%v = call fastcc <38 x double> @test()
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
index c59d31911ad5..54266b12864f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -65,7 +65,6 @@ define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_
define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) {
; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38:
; KNL: ## %bb.0:
-; KNL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; KNL-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; KNL-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7],ymm2[8,9,10,11],ymm3[12,13],ymm2[14],ymm3[15]
@@ -74,7 +73,7 @@ define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_1
; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7],ymm0[8,9,10,11,12],ymm4[13,14,15]
; KNL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,u,u,12,13,u,u,10,11,u,u,8,9,u,u,22,23,u,u,20,21,u,u,18,19,u,u,16,17,u,u]
; KNL-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7],ymm0[8],ymm3[9],ymm0[10],ymm3[11],ymm0[12],ymm3[13],ymm0[14],ymm3[15]
-; KNL-NEXT: vextracti128 $1, %ymm1, %xmm1
+; KNL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
; KNL-NEXT: vpbroadcastw %xmm1, %ymm1
; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7],ymm3[8,9,10,11,12,13,14],ymm1[15]
; KNL-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index fc81e8658538..58cef0725c0d 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1707,24 +1707,22 @@ define void @splat2_v4i64_load_store(<4 x i64>* %s, <8 x i64>* %d) {
define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) {
; AVX1-LABEL: splat4_v8f32_load_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovups (%rdi), %xmm0
-; AVX1-NEXT: vmovups 16(%rdi), %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vmovups %xmm0, 48(%rsi)
-; AVX1-NEXT: vmovups %xmm7, 32(%rsi)
-; AVX1-NEXT: vmovups %xmm6, 16(%rsi)
-; AVX1-NEXT: vmovups %xmm5, (%rsi)
-; AVX1-NEXT: vmovups %xmm1, 112(%rsi)
-; AVX1-NEXT: vmovups %xmm4, 96(%rsi)
-; AVX1-NEXT: vmovups %xmm3, 80(%rsi)
-; AVX1-NEXT: vmovups %xmm2, 64(%rsi)
+; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm0
+; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm2
+; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm3
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm4
+; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm5
+; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm6
+; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm7
+; AVX1-NEXT: vmovups %xmm7, 48(%rsi)
+; AVX1-NEXT: vmovups %xmm6, 32(%rsi)
+; AVX1-NEXT: vmovups %xmm5, 16(%rsi)
+; AVX1-NEXT: vmovups %xmm4, (%rsi)
+; AVX1-NEXT: vmovups %xmm3, 112(%rsi)
+; AVX1-NEXT: vmovups %xmm2, 96(%rsi)
+; AVX1-NEXT: vmovups %xmm1, 80(%rsi)
+; AVX1-NEXT: vmovups %xmm0, 64(%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat4_v8f32_load_store:
@@ -1768,24 +1766,23 @@ define void @splat4_v8f32_load_store(<8 x float>* %s, <32 x float>* %d) {
define void @splat4_v8i32_load_store(<8 x i32>* %s, <32 x i32>* %d) {
; AVX1-LABEL: splat4_v8i32_load_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovups (%rdi), %xmm0
-; AVX1-NEXT: vmovups 16(%rdi), %xmm1
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,1,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-NEXT: vmovups %xmm0, 48(%rsi)
-; AVX1-NEXT: vmovups %xmm7, 32(%rsi)
-; AVX1-NEXT: vmovups %xmm6, 16(%rsi)
-; AVX1-NEXT: vmovups %xmm5, (%rsi)
-; AVX1-NEXT: vmovups %xmm1, 112(%rsi)
-; AVX1-NEXT: vmovups %xmm4, 96(%rsi)
-; AVX1-NEXT: vmovups %xmm3, 80(%rsi)
-; AVX1-NEXT: vmovups %xmm2, 64(%rsi)
+; AVX1-NEXT: vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT: vbroadcastss 4(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss 8(%rdi), %xmm1
+; AVX1-NEXT: vbroadcastss 12(%rdi), %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vbroadcastss 16(%rdi), %xmm2
+; AVX1-NEXT: vbroadcastss 20(%rdi), %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastss 24(%rdi), %xmm3
+; AVX1-NEXT: vbroadcastss 28(%rdi), %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vmovups %ymm3, 96(%rsi)
+; AVX1-NEXT: vmovups %ymm2, 64(%rsi)
+; AVX1-NEXT: vmovups %ymm1, 32(%rsi)
+; AVX1-NEXT: vmovups %ymm0, (%rsi)
+; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat4_v8i32_load_store:
More information about the llvm-commits mailing list