[llvm] e9015bd - [X86][AVX] lowerShuffleAsBroadcast - MOVDDUP(SCALAR_TO_VECTOR(X)) -> BROADCAST(X)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 24 04:32:11 PDT 2021
Author: Simon Pilgrim
Date: 2021-03-24T11:31:56Z
New Revision: e9015bd59519e205c2205fa413c8af7e677cc65d
URL: https://github.com/llvm/llvm-project/commit/e9015bd59519e205c2205fa413c8af7e677cc65d
DIFF: https://github.com/llvm/llvm-project/commit/e9015bd59519e205c2205fa413c8af7e677cc65d.diff
LOG: [X86][AVX] lowerShuffleAsBroadcast - MOVDDUP(SCALAR_TO_VECTOR(X)) -> BROADCAST(X)
Prefer broadcasting from a scalar source on AVX targets, as this makes it easier for later folds to strip away bitcasts etc.
This helps a lot with the poor AVX1 codegen from PR49658.
There's a trivial regression in the bitcast-int-to-vector-bool-*ext.ll tests due to SimplifyDemandedBits not being able to see a multi-use case, but there are bigger existing codegen issues to be addressed first in those tests (unnecessary NOTs).
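For reference, a minimal IR splat pattern that exercises this lowering path, adapted from the test_mm256_set1_epi64x test below (the function name @splat_v4i64 is illustrative):

  ; Splat a scalar i64 across all four lanes of a <4 x i64>.
  define <4 x i64> @splat_v4i64(i64 %a0) {
    %v0 = insertelement <4 x i64> undef, i64 %a0, i32 0
    %splat = shufflevector <4 x i64> %v0, <4 x i64> undef, <4 x i32> zeroinitializer
    ret <4 x i64> %splat
  }

Previously the 128-bit half of such a splat was lowered as MOVDDUP(SCALAR_TO_VECTOR(X)); emitting X86ISD::VBROADCAST from the scalar instead lets later combines see through the bitcasts, which is why the AVX1 checks below change from vmovddup to vpshufd.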
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
llvm/test/CodeGen/X86/avx-splat.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
llvm/test/CodeGen/X86/combine-bitselect.ll
llvm/test/CodeGen/X86/combine-pmuldq.ll
llvm/test/CodeGen/X86/insertelement-var-index.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f1a3a685df20..bdb09b919a39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13705,9 +13705,15 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
V = extract128BitVector(V, ExtractIdx, DAG, DL);
}
- if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
- DAG.getBitcast(MVT::f64, V));
+ // On AVX we can use VBROADCAST directly for scalar sources.
+ if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
+ V = DAG.getBitcast(MVT::f64, V);
+ if (Subtarget.hasAVX()) {
+ V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
+ return DAG.getBitcast(VT, V);
+ }
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
+ }
// If this is a scalar, do the broadcast on this type and bitcast.
if (!V.getValueType().isVector()) {
diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index 35c449e813c0..7e9a727e7230 100644
--- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -1965,7 +1965,7 @@ define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X64-LABEL: test_mm256_set1_epi64x:
; X64: # %bb.0:
; X64-NEXT: vmovq %rdi, %xmm0
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
%res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 7602975c8872..1890b44eb075 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -34,7 +34,7 @@ define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
; X64-LABEL: funcC:
; X64: # %bb.0: # %entry
; X64-NEXT: vmovq %rdi, %xmm0
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 471298492735..64d12cc190d9 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -205,7 +205,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -430,7 +430,8 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index d014798c78c4..0d04e0a21466 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -261,7 +261,7 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
; AVX1-LABEL: ext_i4_4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -553,7 +553,8 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
;
; AVX1-LABEL: ext_i8_8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd %edi, %xmm0
+; AVX1-NEXT: # kill: def $edi killed $edi def $rdi
+; AVX1-NEXT: vmovq %rdi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
; AVX1-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
index d57bd877500c..614d134173e7 100644
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -505,26 +505,18 @@ define <4 x i64> @bitselect_v4i64_broadcast_rrr(<4 x i64> %a0, <4 x i64> %a1, i6
; XOP-LABEL: bitselect_v4i64_broadcast_rrr:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm2
-; XOP-NEXT: vmovq %rdi, %xmm3
-; XOP-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; XOP-NEXT: vandps %ymm2, %ymm0, %ymm0
-; XOP-NEXT: vandnps %ymm1, %ymm3, %ymm1
-; XOP-NEXT: vorps %ymm1, %ymm0, %ymm0
+; XOP-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v4i64_broadcast_rrr:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm2
-; AVX1-NEXT: vmovq %rdi, %xmm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -881,32 +873,22 @@ define <8 x i64> @bitselect_v8i64_broadcast_rrr(<8 x i64> %a0, <8 x i64> %a1, i6
; XOP-LABEL: bitselect_v8i64_broadcast_rrr:
; XOP: # %bb.0:
; XOP-NEXT: vmovq %rdi, %xmm4
-; XOP-NEXT: vmovq %rdi, %xmm5
-; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm4[0,0]
+; XOP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; XOP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
-; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
-; XOP-NEXT: vandps %ymm4, %ymm1, %ymm1
-; XOP-NEXT: vandps %ymm4, %ymm0, %ymm0
-; XOP-NEXT: vandnps %ymm3, %ymm5, %ymm3
-; XOP-NEXT: vorps %ymm3, %ymm1, %ymm1
-; XOP-NEXT: vandnps %ymm2, %ymm5, %ymm2
-; XOP-NEXT: vorps %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vpcmov %ymm4, %ymm2, %ymm0, %ymm0
+; XOP-NEXT: vpcmov %ymm4, %ymm3, %ymm1, %ymm1
; XOP-NEXT: retq
;
; AVX1-LABEL: bitselect_v8i64_broadcast_rrr:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm4
-; AVX1-NEXT: vmovq %rdi, %xmm5
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm4[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vandnps %ymm3, %ymm5, %ymm3
+; AVX1-NEXT: vandnps %ymm3, %ymm4, %ymm3
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vandnps %ymm2, %ymm4, %ymm2
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 7868c8b21a93..ae619ab590ec 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -372,43 +372,28 @@ define <8 x i32> @PR49658_zext(i32* %ptr, i32 %mul) {
; AVX1: # %bb.0: # %start
; AVX1-NEXT: movl %esi, %eax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB7_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm7 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm3
-; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpsllq $32, %xmm7, %xmm7
-; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm7
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; AVX1-NEXT: vpmuludq %xmm5, %xmm9, %xmm3
-; AVX1-NEXT: vpmuludq %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
-; AVX1-NEXT: vpmuludq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,3],xmm3[1,3]
+; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: subq $-128, %rax
; AVX1-NEXT: jne .LBB7_1
; AVX1-NEXT: # %bb.2: # %end
@@ -564,55 +549,28 @@ define <8 x i32> @PR49658_sext(i32* %ptr, i32 %mul) {
; AVX1: # %bb.0: # %start
; AVX1-NEXT: movslq %esi, %rax
; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX1-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
-; AVX1-NEXT: vpsrlq $32, %xmm9, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT: vpmovsxdq 2097152(%rdi,%rax), %xmm4
-; AVX1-NEXT: vpmovsxdq 2097160(%rdi,%rax), %xmm5
-; AVX1-NEXT: vpmovsxdq 2097168(%rdi,%rax), %xmm6
-; AVX1-NEXT: vpmovsxdq 2097176(%rdi,%rax), %xmm7
-; AVX1-NEXT: vpsrlq $32, %xmm7, %xmm3
-; AVX1-NEXT: vpmuludq %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm7, %xmm9, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm6, %xmm3
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm7
-; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[1,3],xmm2[1,3]
-; AVX1-NEXT: vpmuludq %xmm5, %xmm8, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm9, %xmm6
-; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpaddq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm5
-; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm1, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm4
-; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,3],xmm3[1,3]
+; AVX1-NEXT: vpmovsxdq 2097152(%rdi,%rax), %xmm3
+; AVX1-NEXT: vpmovsxdq 2097160(%rdi,%rax), %xmm4
+; AVX1-NEXT: vpmovsxdq 2097168(%rdi,%rax), %xmm5
+; AVX1-NEXT: vpmovsxdq 2097176(%rdi,%rax), %xmm6
+; AVX1-NEXT: vpmuldq %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpmuldq %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3]
+; AVX1-NEXT: vpmuldq %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpmuldq %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3],xmm4[1,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpaddd %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: subq $-128, %rax
; AVX1-NEXT: jne .LBB8_1
; AVX1-NEXT: # %bb.2: # %end
diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll
index c88f294926a2..a0b7df81d580 100644
--- a/llvm/test/CodeGen/X86/insertelement-var-index.ll
+++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll
@@ -461,7 +461,7 @@ define <4 x i64> @arg_i64_v4i64_undef(i64 %x, i32 %y) nounwind {
; AVX1-LABEL: arg_i64_v4i64_undef:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %rdi, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1422,7 +1422,7 @@ define <4 x double> @arg_f64_v4f64(<4 x double> %v, double %x, i32 %y) nounwind
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
; AVX1-NEXT: movslq %edi, %rax
; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = xmm2[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX1-NEXT: vpcmpeqq {{.*}}(%rip), %xmm2, %xmm3
; AVX1-NEXT: vpcmpeqq {{\.LCPI.*}}+{{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -1704,7 +1704,7 @@ define <4 x double> @load_f64_v4f64(<4 x double> %v, double* %p, i32 %y) nounwin
; AVX1: # %bb.0:
; AVX1-NEXT: movslq %esi, %rax
; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX1-NEXT: vpcmpeqq {{.*}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vpcmpeqq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1