[llvm] fa9c12e - [X86] Attempt to combine binary shuffles where both operands come from the same larger vector
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 13 06:34:44 PDT 2022
Author: Simon Pilgrim
Date: 2022-10-13T14:34:11+01:00
New Revision: fa9c12ed964b8201e142d78e430ad4c76bd7af62
URL: https://github.com/llvm/llvm-project/commit/fa9c12ed964b8201e142d78e430ad4c76bd7af62
DIFF: https://github.com/llvm/llvm-project/commit/fa9c12ed964b8201e142d78e430ad4c76bd7af62.diff
LOG: [X86] Attempt to combine binary shuffles where both operands come from the same larger vector
This allows us to use combineX86ShuffleChainWithExtract to combine targetshuffle(low_subvector(x),high_subvector(x)) -> low_subvector(targetshuffle(x)) style patterns.
This is currently very limited (the result must be v2i64/v2f64), but while triaging I noticed we might be able to extend this to allow more types for targets with suitable variable cross-lane shuffle support.
Fixes #58339
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
llvm/test/CodeGen/X86/vector-shuffle-combining.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c9942ab22cd7c..f3668b339fe30 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39550,10 +39550,22 @@ static SDValue combineX86ShufflesRecursively(
std::swap(Ops[0], Ops[1]);
}
- // Finally, try to combine into a single shuffle instruction.
- return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
- AllowVariableCrossLaneMask,
- AllowVariablePerLaneMask, DAG, Subtarget);
+ // Try to combine into a single shuffle instruction.
+ if (SDValue Shuffle = combineX86ShuffleChain(
+ Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
+ AllowVariablePerLaneMask, DAG, Subtarget))
+ return Shuffle;
+
+ // If all the operands come from the same larger vector, fallthrough and try
+ // to use combineX86ShuffleChainWithExtract.
+ SDValue LHS = peekThroughBitcasts(Ops.front());
+ SDValue RHS = peekThroughBitcasts(Ops.back());
+ if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
+ (RootSizeInBits / Mask.size()) != 64 ||
+ LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ LHS.getOperand(0) != RHS.getOperand(0))
+ return SDValue();
}
// If that failed and any input is extracted then try to combine as a
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index a182dee05cd5e..fb2210366146d 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -974,8 +974,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i
define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
; CHECK-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
@@ -989,8 +988,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32
define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: vzeroupper
@@ -2244,7 +2242,6 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
ret <4 x i64> %res
}
-
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
; CHECK: # %bb.0:
@@ -2259,11 +2256,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
ret <4 x i64> %res
}
+
define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2272,10 +2270,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-NEXT: vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2283,13 +2280,12 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64
%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
ret <2 x i64> %res
}
-
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0]
+; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2297,6 +2293,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
ret <2 x i64> %res
}
+
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
; CHECK: # %bb.0:
@@ -2311,7 +2308,6 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64
%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
ret <2 x i64> %res
}
-
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
; CHECK: # %bb.0:
@@ -2326,6 +2322,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i
%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
ret <2 x i64> %res
}
+
define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
; CHECK: # %bb.0:
@@ -2347,7 +2344,6 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
ret <4 x i64> %res
}
-
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
; CHECK: # %bb.0:
@@ -2697,8 +2693,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64>
define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovaps 32(%rdi), %xmm0
-; CHECK-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4,1]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2707,10 +2705,11 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %xmm2
-; CHECK-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [4,1]
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm2
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
; CHECK-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2722,10 +2721,11 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 32(%rdi), %xmm1
-; CHECK-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1]
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -4003,12 +4003,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double>
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,5,5]
-; CHECK-FAST-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-FAST-NEXT: vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-FAST-NEXT: vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-FAST-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm3 = [1,5]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
+; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
@@ -4029,12 +4029,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-FAST-NEXT: vmovapd {{.*#+}} ymm3 = [1,1,5,5]
-; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermt2pd %ymm2, %ymm3, %ymm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: vmovapd {{.*#+}} xmm2 = [1,5]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
; CHECK-FAST-NEXT: retq
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 3f2a8098a6564..6d2553fb9eb95 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2204,13 +2204,21 @@ define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
;FIXME: compressp
define <2 x double> @test_v8f64_34 (<8 x double> %v) {
-; ALL-LABEL: test_v8f64_34:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: ret{{[l|q]}}
+; AVX512F-LABEL: test_v8f64_34:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [3,4]
+; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512F-32-LABEL: test_v8f64_34:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovaps {{.*#+}} xmm1 = [3,0,4,0]
+; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
%res = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 3, i32 4>
ret <2 x double> %res
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 7438b0a7f5cb3..068cfff72a7d5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -636,13 +636,12 @@ define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1)
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
-; TODO: Failure to merge vpunpcklqdq(vextracti128(x,0),vextracti128(x,1)) -> vpermq
define <8 x i16> @shuffle_combine_packusdw_permq_extract(<8 x i32> %a0) {
; CHECK-LABEL: shuffle_combine_packusdw_permq_extract:
; CHECK: # %bb.0:
; CHECK-NEXT: vpackusdw %ymm0, %ymm0, %ymm0
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
%1 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> poison)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index f587720e477c9..b8fac4d73546d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1566,14 +1566,23 @@ define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
; SSE-NEXT: movaps %xmm2, (%rdi)
; SSE-NEXT: retq
;
-; AVX-LABEL: combine_test21:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX-NEXT: vmovaps %xmm2, (%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: combine_test21:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT: vmovaps %xmm2, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: combine_test21:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
+; AVX2-NEXT: vmovaps %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
%1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x i32> %1, ptr %ptr, align 16