[llvm-branch-commits] [llvm] DAG: Avoid forming shufflevector from a single extract_vector_elt (PR #122672)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 14 08:42:04 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/122672
From acdbd89a32c585668dc6ad9797a9b7f578f84776 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 10 Jan 2025 21:13:09 +0700
Subject: [PATCH] DAG: Avoid forming shufflevector from a single extract_vector_elt
This avoids regressions in a future AMDGPU commit. Previously we
would take a build_vector (extract_vector_elt x), undef on a target with
free access to the elements and bloat it into a shuffle of one element +
undef, which has much worse combine support than the extract.
Alternatively, we could check aggressivelyPreferBuildVectorSources, but
I'm not sure it's really different from isExtractVecEltCheap.
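For illustration, the pattern being protected looks roughly like this in
SelectionDAG notation (a hand-written sketch with invented node numbers,
not actual -debug output), where t0 is some v2f32 value:

    t1: f32 = extract_vector_elt t0, Constant:i64<1>
    t2: v2f32 = BUILD_VECTOR t1, undef:f32

Previously reduceBuildVecToShuffle would canonicalize t2 into

    t3: v2f32 = vector_shuffle<1,u> t0, undef:v2f32

even when the target can read element 1 for free, trading the
easily-combined extract for a shuffle.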
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 ++-
.../CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 +-
llvm/test/CodeGen/X86/avx512-build-vector.ll | 8 +-
.../X86/avx512-shuffles/partial_permute.ll | 157 ++++++++++--------
.../CodeGen/X86/insertelement-duplicates.ll | 10 +-
llvm/test/CodeGen/X86/sse-align-12.ll | 4 +-
6 files changed, 123 insertions(+), 91 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 51381b85a5e1b6..5c10cf400d8bc9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23799,6 +23799,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
SmallVector<SDValue, 8> VecIn;
VecIn.push_back(SDValue());
+ // If we have a single extract_element with a constant index, track the index
+ // value.
+ unsigned OneConstExtractIndex = ~0u;
+
for (unsigned i = 0; i != NumElems; ++i) {
SDValue Op = N->getOperand(i);
@@ -23816,16 +23820,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// Not an undef or zero. If the input is something other than an
// EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
- if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(Op.getOperand(1)))
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
- SDValue ExtractedFromVec = Op.getOperand(0);
+ SDValue ExtractedFromVec = Op.getOperand(0);
if (ExtractedFromVec.getValueType().isScalableVector())
return SDValue();
+ auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!ExtractIdx)
+ return SDValue();
- const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
- if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
+ if (ExtractIdx->getAsAPIntVal().uge(
+ ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
// All inputs must have the same element type as the output.
@@ -23833,6 +23839,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
ExtractedFromVec.getValueType().getVectorElementType())
return SDValue();
+ OneConstExtractIndex = ExtractIdx->getZExtValue();
+
// Have we seen this input vector before?
// The vectors are expected to be tiny (usually 1 or 2 elements), so using
// a map back from SDValues to numbers isn't worth it.
@@ -23855,6 +23863,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
// VecIn accordingly.
bool DidSplitVec = false;
if (VecIn.size() == 2) {
+ // If we only found a single constant indexed extract_vector_elt feeding the
+ // build_vector, do not produce a more complicated shuffle if the extract is
+ // cheap.
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
+ TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
+ return SDValue();
+
unsigned MaxIndex = 0;
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
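Condensed, the new bail-out amounts to the check below (a sketch using
only names from the hunk above, not a drop-in replacement). VecIn starts
with a null sentinel and then holds the distinct source vectors, so
size() == 2 means every non-undef/zero element came from a single vector:

    // Sketch of the early exit: OneConstExtractIndex holds the constant
    // index of the extract_vector_elt that was recorded while scanning the
    // build_vector operands. If extracting that element is legal and cheap
    // for the target, keep the build_vector + extract form rather than
    // forming a more complicated shuffle.
    if (VecIn.size() == 2 &&
        TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) &&
        TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))
      return SDValue();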
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 7912d1cf8dc0d1..add8c0f75bf335 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3
; GCN-NEXT: s_and_b32 s6, s4, 0x1010101
; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5]
; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3]
-; GCN-NEXT: v_mov_b32_e32 v3, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s2
-; GCN-NEXT: v_mov_b32_e32 v1, s3
-; GCN-NEXT: v_mov_b32_e32 v2, s0
-; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-NEXT: s_endpgm
entry:
%v = insertelement <8 x i8> %vec, i8 1, i32 %sel
diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll
index b21a0c4e36c2bd..27cb3eb406e9e8 100644
--- a/llvm/test/CodeGen/X86/avx512-build-vector.ll
+++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll
@@ -14,11 +14,9 @@ define <16 x i32> @test2(<16 x i32> %x) {
define <16 x float> @test3(<4 x float> %a) {
; CHECK-LABEL: test3:
; CHECK: ## %bb.0:
-; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15]
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%b = extractelement <4 x float> %a, i32 2
%c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index 5d901a8a380a9c..7aa7eea500fb10 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -2846,12 +2846,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,0]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
-; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,3,5,3]
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
@@ -2863,11 +2863,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec,
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,0]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,3]
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 0>
@@ -2878,12 +2879,12 @@ define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask1(<8 x float> %vec
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,2,7,0]
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
-; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [3,2,7,3]
+; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm3
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
@@ -2895,11 +2896,12 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec,
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask2(<8 x float> %vec, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,2,7,0]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,2,7,3]
+; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm2
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 3, i32 2, i32 7, i32 0>
@@ -3885,10 +3887,12 @@ define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_4xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 16(%rdi), %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd (%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vpermpd $226, (%rdi), %ymm0 {%k1} # ymm0 {%k1} = mem[2,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <4 x double>, ptr %vp
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -3900,10 +3904,11 @@ define <2 x double> @test_masked_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
define <2 x double> @test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_4xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd 16(%rdi), %xmm1
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vunpcklpd (%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1
+; CHECK-NEXT: vpermpd $226, (%rdi), %ymm0 {%k1} {z} # ymm0 {%k1} {z} = mem[2,0,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <4 x double>, ptr %vp
%shuf = shufflevector <4 x double> %vec, <4 x double> undef, <2 x i32> <i32 2, i32 0>
@@ -4130,16 +4135,18 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [5,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm1
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [7,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0
+; CHECK-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
+; CHECK-FAST-PERLANE-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
+; CHECK-FAST-PERLANE-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-FAST-PERLANE-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-PERLANE-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
ret <4 x double> %res
@@ -4147,21 +4154,23 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
-; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm3
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm4 = [7,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm4, %zmm0
+; CHECK-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-FAST-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-FAST-PERLANE-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
+; CHECK-FAST-PERLANE-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
@@ -4173,20 +4182,23 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [5,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm2
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [7,0]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
+; CHECK-FAST-PERLANE-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
@@ -4493,9 +4505,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(ptr %vp,
define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm1
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,2]
+; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
+; CHECK-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retq
%vec = load <8 x double>, ptr %vp
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4504,12 +4519,15 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp) {
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [4,2,1,0]
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm3
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1}
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,2]
+; CHECK-NEXT: vmovapd (%rdi), %zmm3
+; CHECK-NEXT: vpermpd %zmm3, %zmm2, %zmm2
+; CHECK-NEXT: vmovddup 8(%rdi), %xmm4 # xmm4 = mem[0,0]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1}
; CHECK-NEXT: retq
%vec = load <8 x double>, ptr %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
@@ -4521,12 +4539,15 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(ptr %vp, <4 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %ymm2
-; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [4,2,1,0]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1
-; CHECK-NEXT: vpermi2pd 32(%rdi), %ymm2, %ymm1 {%k1} {z}
-; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,2]
+; CHECK-NEXT: vmovapd (%rdi), %zmm2
+; CHECK-NEXT: vpermpd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vmovddup 8(%rdi), %xmm3 # xmm3 = mem[0,0]
+; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1
+; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z}
; CHECK-NEXT: retq
%vec = load <8 x double>, ptr %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 4, i32 2, i32 1, i32 0>
diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
index 435ea61412b73e..3da53a4ca5f1b9 100644
--- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll
@@ -31,18 +31,16 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli
; AVX-32: # %bb.0: # %L.entry
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0
-; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
; AVX-32-NEXT: vmovups %ymm0, 608(%eax)
; AVX-32-NEXT: vzeroupper
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: PR15298:
; AVX-64: # %bb.0: # %L.entry
-; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0
-; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7]
+; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1]
; AVX-64-NEXT: vmovups %ymm0, 608(%rsi)
; AVX-64-NEXT: vzeroupper
; AVX-64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll
index 7b4bd3ffdf00c5..0d5bdb0954ce31 100644
--- a/llvm/test/CodeGen/X86/sse-align-12.ll
+++ b/llvm/test/CodeGen/X86/sse-align-12.ll
@@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind {
define <2 x double> @c(ptr %y) nounwind {
; CHECK-LABEL: c:
; CHECK: # %bb.0:
-; CHECK-NEXT: movups (%rdi), %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; CHECK-NEXT: retq
%x = load <2 x double>, ptr %y, align 8
%a = extractelement <2 x double> %x, i32 0