[llvm] 4972722 - [X86] lowerV4F64Shuffle - prefer lowerShuffleAsDecomposedShuffleMerge if we're blending inplace/splatable shuffle inputs on AVX2 targets (#126420)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 9 09:11:21 PST 2025
Author: Simon Pilgrim
Date: 2025-02-09T17:11:18Z
New Revision: 4972722f90deddf45c29958070bb1beb509e72ac
URL: https://github.com/llvm/llvm-project/commit/4972722f90deddf45c29958070bb1beb509e72ac
DIFF: https://github.com/llvm/llvm-project/commit/4972722f90deddf45c29958070bb1beb509e72ac.diff
LOG: [X86] lowerV4F64Shuffle - prefer lowerShuffleAsDecomposedShuffleMerge if we're blending inplace/splatable shuffle inputs on AVX2 targets (#126420)
More aggressively use broadcast instructions where possible
Fixes #50315
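[Editorial note, not part of the commit message: a minimal standalone C++ sketch of why this decomposition is legal for the kind of mask the patch targets. The helper names and sample values below are invented for illustration and are not LLVM code. A v4f64 shuffle whose second input only contributes its element 0, such as the <0, 1, 2, 4> mask in the tests below, equals broadcasting that element and blending it into the first input, which on AVX2 maps to vbroadcastsd plus a blend rather than vinsertf128 plus vshufpd.]

#include <array>
#include <cassert>

using V4F64 = std::array<double, 4>;

// Reference semantics of a two-input v4f64 shuffle: mask values 0-3 pick from
// A, 4-7 pick from B.
static V4F64 shuffle(const V4F64 &A, const V4F64 &B, std::array<int, 4> M) {
  V4F64 R{};
  for (int i = 0; i < 4; ++i)
    R[i] = M[i] < 4 ? A[M[i]] : B[M[i] - 4];
  return R;
}

// vbroadcastsd: splat one scalar across all four elements.
static V4F64 broadcast(double X) { return {X, X, X, X}; }

// vblendpd: take element i from B where Sel[i] is set, otherwise from A.
static V4F64 blend(const V4F64 &A, const V4F64 &B, std::array<bool, 4> Sel) {
  V4F64 R{};
  for (int i = 0; i < 4; ++i)
    R[i] = Sel[i] ? B[i] : A[i];
  return R;
}

int main() {
  V4F64 X{1, 2, 3, 4}, Y{5, 6, 7, 8};
  V4F64 Direct = shuffle(X, Y, {0, 1, 2, 4});
  V4F64 Decomposed = blend(X, broadcast(Y[0]), {false, false, false, true});
  assert(Direct == Decomposed && "both give <1, 2, 3, 5>");
  (void)Direct;
  (void)Decomposed;
  return 0;
}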
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
llvm/test/CodeGen/X86/horizontal-sum.ll
llvm/test/CodeGen/X86/matrix-multiply.ll
llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 744e4e740cb2102..9a916a663a64c20 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12689,6 +12689,20 @@ static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
   return true;
 }
 
+/// Test whether the specified input (0 or 1) is a broadcast/splat blended by
+/// the given mask.
+///
+static bool isShuffleMaskInputBroadcastable(int Input, ArrayRef<int> Mask,
+                                            int BroadcastableElement = 0) {
+  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
+  int Size = Mask.size();
+  for (int i = 0; i < Size; ++i)
+    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
+        Mask[i] % Size != BroadcastableElement)
+      return false;
+  return true;
+}
+
 /// If we are extracting two 128-bit halves of a vector and shuffling the
 /// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
 /// multi-shuffle lowering.
@@ -16190,6 +16204,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
   bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
+  bool V1IsSplat = isShuffleMaskInputBroadcastable(0, Mask);
+  bool V2IsSplat = isShuffleMaskInputBroadcastable(1, Mask);
 
   // If we have lane crossing shuffles AND they don't all come from the lower
   // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
@@ -16198,7 +16214,9 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
       !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
       (V1.getOpcode() != ISD::BUILD_VECTOR) &&
-      (V2.getOpcode() != ISD::BUILD_VECTOR))
+      (V2.getOpcode() != ISD::BUILD_VECTOR) &&
+      (!Subtarget.hasAVX2() ||
+       !((V1IsInPlace || V1IsSplat) && (V2IsInPlace || V2IsSplat))))
     return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
 
   // If we have one input in place, then we can permute the other input and
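
[Editorial note: a rough standalone restatement of the predicate added above, in plain C++ with std::vector in place of ArrayRef; the driver and its output comments are illustrative only. An input is considered splatable when every mask element that refers to it reads the same element of that input. For the <0, 1, 2, 4> v4f64 mask in the first test below, V1 is in place and V2 is splatable, so on AVX2 the extended guard above now skips lowerShuffleAsLanePermuteAndSHUFP and the shuffle is decomposed into a broadcast plus a blend instead.]

#include <cassert>
#include <cstdio>
#include <vector>

// Standalone sketch of isShuffleMaskInputBroadcastable (not LLVM code).
static bool isInputBroadcastable(int Input, const std::vector<int> &Mask,
                                 int BroadcastableElement = 0) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input &&
        Mask[i] % Size != BroadcastableElement)
      return false;
  return true;
}

int main() {
  // Mask from the first test below: shufflevector <4 x double>, <0, 1, 2, 4>.
  std::vector<int> Mask = {0, 1, 2, 4};
  std::printf("V1 splatable: %d\n", isInputBroadcastable(0, Mask)); // 0: reads elements 0, 1, 2 of V1
  std::printf("V2 splatable: %d\n", isInputBroadcastable(1, Mask)); // 1: only reads element 0 of V2
  return 0;
}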
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index 1baaab0931cb9ad..26a88ab15e3cca1 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -151,8 +151,8 @@ define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_un
define <4 x double> @vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: vec256_eltty_double_source_subvec_0_target_subvec_mask_2_binary:
; CHECK: # %bb.0:
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
+; CHECK-NEXT: vbroadcastsd %xmm1, %ymm1
+; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; CHECK-NEXT: retq
%r = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
ret <4 x double> %r
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 5fe1e2996ee9b08..e2cc3ae0dca0af2 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -256,11 +256,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -277,11 +277,11 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
-; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm1
+; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
%9 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
%10 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 1, i32 3>
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index bdc1ff4c157e4fc..a38ca339cd5e133 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -659,57 +659,57 @@ define <9 x double> @test_mul3x3_f64(<9 x double> %a0, <9 x double> %a1) nounwin
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: movq %rdi, %rax
; AVX2-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm0
-; AVX2-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm4, %xmm3, %xmm10
-; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm10
+; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm3, %xmm1, %xmm4
+; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
; AVX2-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],xmm7[0]
; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm7, %xmm6, %xmm10
-; AVX2-NEXT: vaddpd %xmm0, %xmm10, %xmm0
+; AVX2-NEXT: vaddpd %xmm4, %xmm10, %xmm4
; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm9
-; AVX2-NEXT: vmulsd %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmulsd %xmm3, %xmm5, %xmm3
+; AVX2-NEXT: vaddsd %xmm3, %xmm9, %xmm3
; AVX2-NEXT: vmulsd %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vaddsd %xmm7, %xmm4, %xmm4
-; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm7, %xmm1, %xmm9
+; AVX2-NEXT: vaddsd %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX2-NEXT: vmovddup {{.*#+}} xmm4 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm4, %xmm0, %xmm7
+; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm1, %xmm9, %xmm10
+; AVX2-NEXT: vaddpd %xmm7, %xmm10, %xmm7
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm10, %xmm11
-; AVX2-NEXT: vaddpd %xmm11, %xmm9, %xmm9
-; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm6, %xmm11, %xmm12
-; AVX2-NEXT: vaddpd %xmm12, %xmm9, %xmm9
-; AVX2-NEXT: vmulsd %xmm7, %xmm2, %xmm7
-; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm10
-; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
-; AVX2-NEXT: vmulsd %xmm11, %xmm8, %xmm10
-; AVX2-NEXT: vaddsd %xmm7, %xmm10, %xmm7
+; AVX2-NEXT: vmulpd %xmm6, %xmm10, %xmm11
+; AVX2-NEXT: vaddpd %xmm7, %xmm11, %xmm7
+; AVX2-NEXT: vmulsd %xmm4, %xmm2, %xmm4
+; AVX2-NEXT: vmulsd %xmm5, %xmm9, %xmm9
+; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmulsd %xmm10, %xmm8, %xmm9
+; AVX2-NEXT: vaddsd %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vmovddup {{.*#+}} xmm9 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm0, %xmm9, %xmm0
; AVX2-NEXT: vmovddup {{.*#+}} xmm10 = mem[0,0]
; AVX2-NEXT: vmulpd %xmm1, %xmm10, %xmm1
-; AVX2-NEXT: vmovddup {{.*#+}} xmm11 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm11, %xmm3
-; AVX2-NEXT: vaddpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
-; AVX2-NEXT: vmulpd %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vaddpd %xmm6, %xmm1, %xmm1
-; AVX2-NEXT: vmulsd %xmm2, %xmm10, %xmm2
-; AVX2-NEXT: vmulsd %xmm5, %xmm11, %xmm5
+; AVX2-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0]
+; AVX2-NEXT: vmulpd %xmm1, %xmm6, %xmm6
+; AVX2-NEXT: vaddpd %xmm6, %xmm0, %xmm0
+; AVX2-NEXT: vmulsd %xmm2, %xmm9, %xmm2
+; AVX2-NEXT: vmulsd %xmm5, %xmm10, %xmm5
; AVX2-NEXT: vaddsd %xmm5, %xmm2, %xmm2
-; AVX2-NEXT: vmulsd %xmm3, %xmm8, %xmm3
-; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
-; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[2]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm9, %ymm1
-; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm3[0],ymm1[2],ymm3[3]
-; AVX2-NEXT: vmovsd %xmm2, 64(%rdi)
-; AVX2-NEXT: vmovapd %ymm1, 32(%rdi)
-; AVX2-NEXT: vmovapd %ymm0, (%rdi)
+; AVX2-NEXT: vmulsd %xmm1, %xmm8, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vbroadcastsd %xmm7, %ymm2
+; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm3
+; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
+; AVX2-NEXT: vmovsd %xmm1, 64(%rdi)
+; AVX2-NEXT: vmovapd %ymm0, 32(%rdi)
+; AVX2-NEXT: vmovapd %ymm2, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 79602a18693dbed..00af58544e25c0b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -493,11 +493,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X86-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
; X86-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
; X86-AVX2-NEXT: vmovapd %ymm3, (%edx)
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -520,13 +520,13 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT: vpermt2pd %zmm4, %zmm5, %zmm6
-; X86-AVX512-NEXT: vmovapd %ymm6, (%edx)
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
+; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X86-AVX512-NEXT: vmovapd %ymm4, (%edx)
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
@@ -563,11 +563,11 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
;
; X64-AVX2-LABEL: PR48908:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
+; X64-AVX2-NEXT: vbroadcastsd %xmm1, %ymm3
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm2[0,1]
; X64-AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,1]
+; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
; X64-AVX2-NEXT: vmovapd %ymm3, (%rdi)
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm0[2,3]
@@ -587,16 +587,16 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [1,2,8,9]
-; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm4
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,10,2,9]
-; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
-; X64-AVX512-NEXT: vpermt2pd %zmm3, %zmm5, %zmm6
-; X64-AVX512-NEXT: vmovapd %ymm6, (%rdi)
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,3,10,1]
-; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm4, %zmm3
-; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
+; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
+; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
+; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi)
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
+; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
+; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11]
; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]