[llvm] 19c1682 - [X86] combineConcatVectorOps - concatenate 512-bit VPERMILPS nodes.
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 12 10:33:06 PST 2023
Author: Simon Pilgrim
Date: 2023-02-12T18:26:28Z
New Revision: 19c1682b6a4cdbe75113e155fdd711d9ded1a37f
URL: https://github.com/llvm/llvm-project/commit/19c1682b6a4cdbe75113e155fdd711d9ded1a37f
DIFF: https://github.com/llvm/llvm-project/commit/19c1682b6a4cdbe75113e155fdd711d9ded1a37f.diff
LOG: [X86] combineConcatVectorOps - concatenate 512-bit VPERMILPS nodes.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/matrix-multiply.ll
Removed:
################################################################################
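For illustration only (not part of the commit): the diff below extends the VPERMILPI case in combineConcatVectorOps from a hard-coded v8f32/v8i32 pair to any 32-bit-element 256-bit or 512-bit concatenation whose operands all share the same shuffle immediate, with the 512-bit case guarded by Subtarget.useAVX512Regs(). The effect is visible in the updated matrix-multiply.ll checks, where pairs of ymm vpermilps stitched together with vinsertf64x4 become a single zmm vpermilps. The LLVM IR sketch below is a hypothetical example (name and mask are illustrative, and whether it reaches this exact combine depends on how the DAG is built); it just shows the shape of a <16 x float> shuffle whose 4-element pattern repeats in every 128-bit lane, so each 256-bit half would use a VPERMILPI with the same immediate.

; Hypothetical sketch: per-128-bit-lane repeating mask, the kind of
; pattern whose 256-bit VPERMILPI halves can now be concatenated into
; one 512-bit VPERMILPS when AVX512 registers are available.
define <16 x float> @concat_vpermilps_v16f32(<16 x float> %v) {
  %s = shufflevector <16 x float> %v, <16 x float> poison,
       <16 x i32> <i32 2, i32 2, i32 2, i32 2, i32 6, i32 6, i32 6, i32 6,
                   i32 10, i32 10, i32 10, i32 10, i32 14, i32 14, i32 14, i32 14>
  ret <16 x float> %s
}
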
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0ff60df06bffa..c0cd938ff4c8d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55597,11 +55597,16 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
[[fallthrough]];
case X86ISD::VPERMILPI:
- if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
- Op0.getOperand(1) == Ops[1].getOperand(1)) {
- SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
- Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
- Op0.getOperand(1));
+ if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
+ (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+ all_of(Ops, [&Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ MVT FloatVT = VT.changeVectorElementType(MVT::f32);
+ SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
+ Res =
+ DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 076f333cdfdd2..a607cb7834dc0 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1068,32 +1068,28 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
;
; AVX512-LABEL: test_mul4x4_f32:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm6
-; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
-; AVX512-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vinsertf64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3
+; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm4
+; AVX512-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-NEXT: vpermilps {{.*#+}} zmm4 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm5 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512-NEXT: vmulps %zmm4, %zmm5, %zmm4
+; AVX512-NEXT: vpermilps {{.*#+}} zmm5 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3,2,3,2,3]
; AVX512-NEXT: vmulps %zmm5, %zmm6, %zmm5
-; AVX512-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vinsertf64x4 $1, %ymm6, %zmm4, %zmm4
-; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm6 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512-NEXT: vmulps %zmm4, %zmm6, %zmm4
-; AVX512-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vinsertf64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm5 = zmm0[4,5,4,5,4,5,4,5]
-; AVX512-NEXT: vmulps %zmm3, %zmm5, %zmm3
-; AVX512-NEXT: vaddps %zmm3, %zmm4, %zmm3
-; AVX512-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vpermilps {{.*#+}} zmm1 = zmm1[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT: vmulps %zmm1, %zmm3, %zmm1
+; AVX512-NEXT: vaddps %zmm5, %zmm1, %zmm1
+; AVX512-NEXT: vaddps %zmm4, %zmm1, %zmm1
+; AVX512-NEXT: vpermilps {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
-; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vaddps %zmm0, %zmm3, %zmm0
+; AVX512-NEXT: vmulps %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0
; AVX512-NEXT: retq
entry:
%split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2403,339 +2399,325 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
;
; AVX512F-LABEL: test_mul8x8_f32:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: subq $520, %rsp # imm = 0x208
-; AVX512F-NEXT: vmovups %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovups %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovups %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcastss %xmm4, %ymm10
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm11
; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm9
; AVX512F-NEXT: vextractf64x4 $1, %zmm4, %ymm8
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm8[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm9, %zmm23
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm4[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm9, %ymm9
-; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm12
-; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm10, %zmm26
-; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm10
-; AVX512F-NEXT: vbroadcastss %xmm10, %ymm10
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm8[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm27
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm8[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm12
-; AVX512F-NEXT: vbroadcastss %xmm12, %ymm13
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm9, %zmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vbroadcastss %xmm5, %ymm11
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm9, %ymm14
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm10, %zmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm9
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,2,2]
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[2,2,2,2]
; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm14, %zmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm5[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
-; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm15
-; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm15, %zmm11, %zmm0
-; AVX512F-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm10, %zmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm10
-; AVX512F-NEXT: vbroadcastss %xmm10, %ymm10
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm9[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm11
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm9, %zmm16
+; AVX512F-NEXT: vbroadcastss %xmm4, %ymm10
+; AVX512F-NEXT: vextractf32x4 $2, %zmm4, %xmm11
; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm6[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm18
-; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm10
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm12, %zmm20
-; AVX512F-NEXT: vbroadcastss %xmm6, %ymm11
-; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm12
-; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm21
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm6[1,1,3,3]
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm17
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm4[1,1,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm10[1,1,3,3]
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm8[1,1,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm22
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm6[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[3,3,3,3]
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm18
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[3,3,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm24
-; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm11
-; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11
-; AVX512F-NEXT: vmovshdup {{.*#+}} zmm17 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm19 = [6,6,6,6,6,6,6,6,22,22,22,22,22,22,22,22]
-; AVX512F-NEXT: vmovaps %zmm4, %zmm29
-; AVX512F-NEXT: vpermt2ps %zmm8, %zmm19, %zmm29
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm30 = [7,7,7,7,7,7,7,7,23,23,23,23,23,23,23,23]
-; AVX512F-NEXT: vpermt2ps %zmm8, %zmm30, %zmm4
-; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm8
-; AVX512F-NEXT: vbroadcastss %xmm8, %ymm8
-; AVX512F-NEXT: vmovshdup {{.*#+}} zmm31 = zmm5[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512F-NEXT: vmovaps %zmm5, %zmm2
-; AVX512F-NEXT: vpermt2ps %zmm9, %zmm19, %zmm2
-; AVX512F-NEXT: vpermt2ps %zmm9, %zmm30, %zmm5
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm11, %zmm16
-; AVX512F-NEXT: vmovshdup {{.*#+}} zmm15 = zmm6[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512F-NEXT: vmovaps %zmm6, %zmm25
-; AVX512F-NEXT: vpermt2ps %zmm10, %zmm19, %zmm25
-; AVX512F-NEXT: vpermt2ps %zmm10, %zmm30, %zmm6
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
-; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm0
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[2,2,2,2]
-; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm13
-; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm10
-; AVX512F-NEXT: vbroadcastss %xmm10, %ymm10
-; AVX512F-NEXT: vbroadcastss %xmm7, %ymm11
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm11, %zmm12
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13
+; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm12, %zmm19
+; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm12
+; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm14 = xmm5[2,2,2,2]
+; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm13, %zmm20
+; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm13
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm15 = xmm13[2,2,2,2]
+; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm15, %zmm14, %zmm21
+; AVX512F-NEXT: vbroadcastss %xmm5, %ymm15
+; AVX512F-NEXT: vextractf32x4 $2, %zmm5, %xmm9
+; AVX512F-NEXT: vbroadcastss %xmm9, %ymm9
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm9, %zmm15, %zmm22
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm13[1,1,3,3]
; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm11 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm11, %ymm11
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm11, %zmm10, %zmm9
-; AVX512F-NEXT: vpermi2ps %zmm0, %zmm7, %zmm19
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[3,3,3,3]
-; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm10
-; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm11
-; AVX512F-NEXT: vmovshdup {{.*#+}} zmm8 = zmm7[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512F-NEXT: vpermt2ps %zmm0, %zmm30, %zmm7
-; AVX512F-NEXT: vbroadcastsd %xmm1, %ymm1
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm3
-; AVX512F-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm30
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512F-NEXT: vmulps %zmm26, %zmm30, %zmm1
-; AVX512F-NEXT: vmulps %zmm27, %zmm0, %zmm28
-; AVX512F-NEXT: vaddps %zmm28, %zmm1, %zmm1
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm14, %zmm14, %zmm28
-; AVX512F-NEXT: vmulps %zmm23, %zmm28, %zmm27
-; AVX512F-NEXT: vaddps %zmm27, %zmm1, %zmm1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm27 = zmm14[4,5,6,7,4,5,6,7]
-; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm26 # 64-byte Folded Reload
-; AVX512F-NEXT: vaddps %zmm26, %zmm1, %zmm1
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm14, %zmm14, %zmm26
-; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm26, %zmm23 # 64-byte Folded Reload
-; AVX512F-NEXT: vaddps %zmm23, %zmm1, %zmm1
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm23 = zmm14[4,5,6,7,4,5,6,7]
-; AVX512F-NEXT: vpermpd {{.*#+}} zmm14 = zmm17[2,2,2,2,6,6,6,6]
-; AVX512F-NEXT: vmulps %zmm14, %zmm23, %zmm14
-; AVX512F-NEXT: vaddps %zmm14, %zmm1, %zmm17
-; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm14
-; AVX512F-NEXT: vmulps %zmm29, %zmm14, %zmm29
-; AVX512F-NEXT: vaddps %zmm29, %zmm17, %zmm17
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm29 = zmm1[4,5,6,7,4,5,6,7]
-; AVX512F-NEXT: vmulps %zmm4, %zmm29, %zmm4
-; AVX512F-NEXT: vaddps %zmm4, %zmm17, %zmm4
-; AVX512F-NEXT: vmulps (%rsp), %zmm30, %zmm1 # 64-byte Folded Reload
-; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 64-byte Folded Reload
-; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1
-; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm17 # 64-byte Folded Reload
-; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1
-; AVX512F-NEXT: vmulps %zmm5, %zmm29, %zmm5
-; AVX512F-NEXT: vmulps {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm17 # 64-byte Folded Reload
-; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1
-; AVX512F-NEXT: vmulps %zmm18, %zmm26, %zmm17
-; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1
-; AVX512F-NEXT: vpermpd {{.*#+}} zmm17 = zmm31[2,2,2,2,6,6,6,6]
-; AVX512F-NEXT: vmulps %zmm17, %zmm23, %zmm17
-; AVX512F-NEXT: vaddps %zmm17, %zmm1, %zmm1
-; AVX512F-NEXT: vmulps %zmm2, %zmm14, %zmm2
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm15, %zmm10
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm15 = xmm5[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm11
+; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm15, %zmm15
+; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm13
+; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm6[2,2,2,2]
+; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm11, %zmm11
+; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm13
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm13[2,2,2,2]
+; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm8, %zmm8
+; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12
+; AVX512F-NEXT: vextractf32x4 $2, %zmm6, %xmm14
+; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm14, %zmm12, %zmm12
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm9 = xmm13[1,1,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm9, %ymm9
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm9, %zmm14, %zmm9
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm14
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm23 = zmm0[4,5,6,7,4,5,6,7]
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm24
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm25 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm26
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm27 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512F-NEXT: vmovshdup {{.*#+}} zmm0 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm4
+; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
+; AVX512F-NEXT: vmulps %zmm17, %zmm14, %zmm17
+; AVX512F-NEXT: vmulps %zmm18, %zmm23, %zmm18
+; AVX512F-NEXT: vaddps %zmm18, %zmm17, %zmm17
+; AVX512F-NEXT: vmulps %zmm16, %zmm24, %zmm16
+; AVX512F-NEXT: vaddps %zmm16, %zmm17, %zmm16
+; AVX512F-NEXT: vmulps %zmm19, %zmm25, %zmm17
+; AVX512F-NEXT: vaddps %zmm17, %zmm16, %zmm16
+; AVX512F-NEXT: vmulps %zmm20, %zmm26, %zmm17
+; AVX512F-NEXT: vaddps %zmm17, %zmm16, %zmm16
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm0, %zmm27, %zmm0
+; AVX512F-NEXT: vaddps %zmm0, %zmm16, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm1 = zmm2[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vaddps %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmulps %zmm22, %zmm14, %zmm1
+; AVX512F-NEXT: vmulps %zmm10, %zmm23, %zmm2
; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vaddps %zmm5, %zmm1, %zmm1
-; AVX512F-NEXT: vmulps %zmm21, %zmm30, %zmm2
-; AVX512F-NEXT: vmulps %zmm22, %zmm0, %zmm5
-; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmulps %zmm20, %zmm28, %zmm5
+; AVX512F-NEXT: vmulps %zmm21, %zmm24, %zmm2
+; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmulps %zmm15, %zmm25, %zmm2
+; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmulps %zmm11, %zmm26, %zmm2
+; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmovshdup {{.*#+}} zmm2 = zmm5[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm2, %zmm27, %zmm2
+; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmulps %zmm12, %zmm14, %zmm2
+; AVX512F-NEXT: vmulps %zmm9, %zmm23, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmulps %zmm24, %zmm27, %zmm5
+; AVX512F-NEXT: vmulps %zmm8, %zmm24, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmulps %zmm16, %zmm26, %zmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm5, %ymm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm13[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm5, %zmm5
+; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm8
+; AVX512F-NEXT: vbroadcastss %xmm8, %ymm8
+; AVX512F-NEXT: vmulps %zmm5, %zmm25, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm15[2,2,2,2,6,6,6,6]
-; AVX512F-NEXT: vmulps %zmm5, %zmm23, %zmm5
+; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm5
+; AVX512F-NEXT: vbroadcastss %xmm5, %ymm5
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm8, %zmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[2,2,2,2]
+; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
+; AVX512F-NEXT: vmulps %zmm5, %zmm26, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmulps %zmm25, %zmm14, %zmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} zmm5 = zmm6[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm5, %zmm27, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmulps %zmm6, %zmm29, %zmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm5, %zmm4, %zmm5
; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2
-; AVX512F-NEXT: vmulps %zmm13, %zmm28, %zmm5
-; AVX512F-NEXT: vmulps %zmm12, %zmm30, %zmm6
-; AVX512F-NEXT: vmulps %zmm9, %zmm0, %zmm0
-; AVX512F-NEXT: vaddps %zmm0, %zmm6, %zmm0
-; AVX512F-NEXT: vaddps %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vmulps %zmm3, %zmm27, %zmm3
-; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vbroadcastss %xmm10, %ymm3
-; AVX512F-NEXT: vbroadcastss %xmm11, %ymm5
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm3, %zmm3
-; AVX512F-NEXT: vmulps %zmm3, %zmm26, %zmm3
-; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vpermpd {{.*#+}} zmm3 = zmm8[2,2,2,2,6,6,6,6]
-; AVX512F-NEXT: vmulps %zmm3, %zmm23, %zmm3
-; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vmulps %zmm19, %zmm14, %zmm3
-; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm0
-; AVX512F-NEXT: vmulps %zmm7, %zmm29, %zmm3
-; AVX512F-NEXT: vaddps %zmm3, %zmm0, %zmm3
-; AVX512F-NEXT: vmovaps %zmm4, %zmm0
-; AVX512F-NEXT: addq $520, %rsp # imm = 0x208
+; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm5
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm6, %zmm3, %zmm6
+; AVX512F-NEXT: vaddps %zmm6, %zmm2, %zmm2
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2]
+; AVX512F-NEXT: vbroadcastsd %xmm6, %ymm6
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT: vbroadcastss %xmm7, %ymm8
+; AVX512F-NEXT: vmulps %zmm6, %zmm24, %zmm6
+; AVX512F-NEXT: vextractf32x4 $2, %zmm7, %xmm9
+; AVX512F-NEXT: vbroadcastss %xmm9, %ymm9
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm9 = xmm7[1,1,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm9, %ymm9
+; AVX512F-NEXT: vmulps %zmm8, %zmm14, %zmm8
+; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm5[1,1,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm9, %zmm9
+; AVX512F-NEXT: vmulps %zmm9, %zmm23, %zmm9
+; AVX512F-NEXT: vaddps %zmm9, %zmm8, %zmm8
+; AVX512F-NEXT: vaddps %zmm6, %zmm8, %zmm6
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8
+; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,3,3,3]
+; AVX512F-NEXT: vbroadcastsd %xmm5, %ymm5
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm8, %zmm5
+; AVX512F-NEXT: vmulps %zmm5, %zmm25, %zmm5
+; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX512F-NEXT: vbroadcastss %xmm8, %ymm8
+; AVX512F-NEXT: vaddps %zmm5, %zmm6, %zmm5
+; AVX512F-NEXT: vextractf32x4 $3, %zmm7, %xmm6
+; AVX512F-NEXT: vbroadcastss %xmm6, %ymm6
+; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm8, %zmm6
+; AVX512F-NEXT: vmulps %zmm6, %zmm26, %zmm6
+; AVX512F-NEXT: vaddps %zmm6, %zmm5, %zmm5
+; AVX512F-NEXT: vmovshdup {{.*#+}} zmm6 = zmm7[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm6, %zmm27, %zmm6
+; AVX512F-NEXT: vaddps %zmm6, %zmm5, %zmm5
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm6 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm6, %zmm4, %zmm4
+; AVX512F-NEXT: vaddps %zmm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512F-NEXT: vmulps %zmm5, %zmm3, %zmm3
+; AVX512F-NEXT: vaddps %zmm3, %zmm4, %zmm3
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: test_mul8x8_f32:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm12
+; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm11
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm11
+; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm10
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm15
+; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm14
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm9
; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm8
-; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm10
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm4, %xmm8
-; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm17
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm4, %ymm8
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm8[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm14
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm8[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm18
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm8[3,3,3,3]
+; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8
+; AVX512VL-NEXT: vextractf32x4 $2, %zmm4, %xmm12
+; AVX512VL-NEXT: vbroadcastss %xmm12, %ymm15
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm4, %ymm12
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm12[1,1,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm16 = xmm12[2,2,2,2]
+; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[3,3,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm12, %ymm12
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm16, %zmm14, %zmm16
+; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm14
+; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm17
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm11, %zmm14
+; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm15
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm13, %zmm10, %zmm13
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm11
+; AVX512VL-NEXT: vmulps %zmm14, %zmm11, %zmm14
+; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm10 = zmm0[4,5,6,7,4,5,6,7]
+; AVX512VL-NEXT: vmulps %zmm13, %zmm10, %zmm0
+; AVX512VL-NEXT: vaddps %zmm0, %zmm14, %zmm0
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm14
+; AVX512VL-NEXT: vmulps %zmm16, %zmm14, %zmm13
+; AVX512VL-NEXT: vaddps %zmm13, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm16
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm13
-; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm18, %zmm15, %zmm18
-; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm19
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm17, %zmm12, %zmm12
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm5[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm20
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm14, %zmm11, %zmm11
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm15
-; AVX512VL-NEXT: vmulps %zmm12, %zmm15, %zmm12
-; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm14 = zmm0[4,5,6,7,4,5,6,7]
-; AVX512VL-NEXT: vmulps %zmm11, %zmm14, %zmm0
-; AVX512VL-NEXT: vaddps %zmm0, %zmm12, %zmm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm17
-; AVX512VL-NEXT: vmulps %zmm18, %zmm17, %zmm11
-; AVX512VL-NEXT: vaddps %zmm11, %zmm0, %zmm0
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm11, %ymm18
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm16, %zmm9, %zmm11
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm12, %zmm9, %zmm12
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2]
+; AVX512VL-NEXT: vbroadcastsd %xmm9, %ymm18
; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm9 = zmm1[4,5,6,7,4,5,6,7]
-; AVX512VL-NEXT: vmulps %zmm11, %zmm9, %zmm1
+; AVX512VL-NEXT: vmulps %zmm12, %zmm9, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm13, %zmm10, %zmm1
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm10
-; AVX512VL-NEXT: vmulps %zmm1, %zmm10, %zmm1
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm17, %zmm8, %zmm1
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm8
+; AVX512VL-NEXT: vmulps %zmm1, %zmm8, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm1 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[2,2,2,2,6,6,6,6]
-; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm11 = zmm2[4,5,6,7,4,5,6,7]
-; AVX512VL-NEXT: vmulps %zmm1, %zmm11, %zmm1
+; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm12 = zmm2[4,5,6,7,4,5,6,7]
+; AVX512VL-NEXT: vmulps %zmm1, %zmm12, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vmovaps {{.*#+}} zmm12 = [6,6,6,6,6,6,6,6,22,22,22,22,22,22,22,22]
-; AVX512VL-NEXT: vmovaps %zmm4, %zmm1
-; AVX512VL-NEXT: vpermt2ps %zmm8, %zmm12, %zmm1
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm13
; AVX512VL-NEXT: vmulps %zmm1, %zmm13, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vmovaps {{.*#+}} zmm16 = [7,7,7,7,7,7,7,7,23,23,23,23,23,23,23,23]
-; AVX512VL-NEXT: vpermt2ps %zmm8, %zmm16, %zmm4
; AVX512VL-NEXT: vextractf64x4 $1, %zmm5, %ymm1
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7]
-; AVX512VL-NEXT: vmulps %zmm4, %zmm3, %zmm2
+; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2
; AVX512VL-NEXT: vaddps %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm18, %zmm2
; AVX512VL-NEXT: vextractf32x4 $2, %zmm5, %xmm4
; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm19, %zmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm8
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm8, %zmm20, %zmm8
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm18 = xmm5[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm18, %ymm18
-; AVX512VL-NEXT: vmulps %zmm4, %zmm15, %zmm4
-; AVX512VL-NEXT: vmulps %zmm8, %zmm14, %zmm8
-; AVX512VL-NEXT: vaddps %zmm8, %zmm4, %zmm4
-; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm8
-; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8
-; AVX512VL-NEXT: vmulps %zmm2, %zmm17, %zmm2
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm1[1,1,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm16 = xmm5[3,3,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
+; AVX512VL-NEXT: vmulps %zmm4, %zmm11, %zmm4
+; AVX512VL-NEXT: vmulps %zmm15, %zmm10, %zmm15
+; AVX512VL-NEXT: vaddps %zmm15, %zmm4, %zmm4
+; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm15
+; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1
+; AVX512VL-NEXT: vmulps %zmm2, %zmm14, %zmm2
; AVX512VL-NEXT: vaddps %zmm2, %zmm4, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm18, %zmm4
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm5, %xmm18
-; AVX512VL-NEXT: vbroadcastss %xmm18, %ymm18
-; AVX512VL-NEXT: vmulps %zmm4, %zmm9, %zmm4
-; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
-; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm4
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm18, %zmm8, %zmm8
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm18 = xmm6[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm18, %ymm18
-; AVX512VL-NEXT: vmulps %zmm8, %zmm10, %zmm8
-; AVX512VL-NEXT: vaddps %zmm8, %zmm2, %zmm2
-; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm8 = zmm5[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512VL-NEXT: vpermpd {{.*#+}} zmm8 = zmm8[2,2,2,2,6,6,6,6]
-; AVX512VL-NEXT: vmulps %zmm8, %zmm11, %zmm8
-; AVX512VL-NEXT: vaddps %zmm8, %zmm2, %zmm2
-; AVX512VL-NEXT: vmovaps %zmm5, %zmm8
-; AVX512VL-NEXT: vpermt2ps %zmm1, %zmm12, %zmm8
-; AVX512VL-NEXT: vmulps %zmm8, %zmm13, %zmm8
-; AVX512VL-NEXT: vaddps %zmm8, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm8
-; AVX512VL-NEXT: vpermt2ps %zmm1, %zmm16, %zmm5
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm6, %ymm19
-; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm1
+; AVX512VL-NEXT: vextractf32x4 $3, %zmm5, %xmm4
+; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm1, %zmm16, %zmm1
+; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm16
+; AVX512VL-NEXT: vmulps %zmm1, %zmm9, %zmm1
; AVX512VL-NEXT: vaddps %zmm1, %zmm2, %zmm1
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm19[2,2,2,2]
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm6[1,1,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm8, %zmm2
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm6, %xmm5
-; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm5
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm19[1,1,3,3]
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm15 = xmm6[2,2,2,2]
+; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
+; AVX512VL-NEXT: vmulps %zmm4, %zmm8, %zmm4
+; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm4 = zmm5[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
+; AVX512VL-NEXT: vmulps %zmm4, %zmm12, %zmm4
+; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
+; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4
+; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm6, %ymm4
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm5
+; AVX512VL-NEXT: vaddps %zmm5, %zmm1, %zmm1
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm18, %zmm5
-; AVX512VL-NEXT: vmulps %zmm4, %zmm15, %zmm4
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm15, %zmm5
+; AVX512VL-NEXT: vextractf32x4 $2, %zmm6, %xmm15
+; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm16, %zmm15
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm4[1,1,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm16, %zmm2, %zmm2
+; AVX512VL-NEXT: vmulps %zmm15, %zmm11, %zmm15
+; AVX512VL-NEXT: vmulps %zmm2, %zmm10, %zmm2
+; AVX512VL-NEXT: vaddps %zmm2, %zmm15, %zmm2
; AVX512VL-NEXT: vmulps %zmm5, %zmm14, %zmm5
-; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
+; AVX512VL-NEXT: vaddps %zmm5, %zmm2, %zmm2
; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
-; AVX512VL-NEXT: vmulps %zmm2, %zmm17, %zmm2
-; AVX512VL-NEXT: vaddps %zmm2, %zmm4, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm19[3,3,3,3]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,3,3,3]
; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4
; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm5
@@ -2745,62 +2727,65 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin
; AVX512VL-NEXT: vextractf32x4 $3, %zmm6, %xmm4
; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4
; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4
-; AVX512VL-NEXT: vmulps %zmm4, %zmm10, %zmm4
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm7[2,2,2,2]
+; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
+; AVX512VL-NEXT: vmulps %zmm4, %zmm8, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm4 = zmm6[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
-; AVX512VL-NEXT: vmulps %zmm4, %zmm11, %zmm4
+; AVX512VL-NEXT: vmulps %zmm4, %zmm12, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
-; AVX512VL-NEXT: vmovaps %zmm6, %zmm4
-; AVX512VL-NEXT: vpermt2ps %zmm19, %zmm12, %zmm4
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4
; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm7[2,2,2,2]
-; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4
-; AVX512VL-NEXT: vpermt2ps %zmm19, %zmm16, %zmm6
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm7, %ymm5
+; AVX512VL-NEXT: vextractf64x4 $1, %zmm7, %ymm4
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6]
; AVX512VL-NEXT: vmulps %zmm6, %zmm3, %zmm6
; AVX512VL-NEXT: vaddps %zmm6, %zmm2, %zmm2
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2]
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[2,2,2,2]
; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm6, %zmm5, %zmm5
; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm6
-; AVX512VL-NEXT: vmulps %zmm4, %zmm17, %zmm4
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm7, %xmm8
-; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm8, %zmm6, %zmm6
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm8
-; AVX512VL-NEXT: vmulps %zmm6, %zmm15, %zmm6
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm5[1,1,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm8, %zmm8
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm15 = xmm7[3,3,3,3]
-; AVX512VL-NEXT: vmulps %zmm8, %zmm14, %zmm8
-; AVX512VL-NEXT: vextractf128 $1, %ymm7, %xmm14
-; AVX512VL-NEXT: vaddps %zmm8, %zmm6, %zmm6
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm7, %xmm8
-; AVX512VL-NEXT: vaddps %zmm4, %zmm6, %zmm4
-; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm6 = zmm7[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
-; AVX512VL-NEXT: vpermi2ps %zmm5, %zmm7, %zmm12
-; AVX512VL-NEXT: vpermt2ps %zmm5, %zmm16, %zmm7
-; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,3,3,3]
-; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm15, %zmm5
-; AVX512VL-NEXT: vmulps %zmm5, %zmm9, %zmm5
-; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm5
-; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm8, %zmm5, %zmm5
-; AVX512VL-NEXT: vmulps %zmm5, %zmm10, %zmm5
+; AVX512VL-NEXT: vmulps %zmm5, %zmm14, %zmm5
+; AVX512VL-NEXT: vextractf32x4 $2, %zmm7, %xmm14
+; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm14, %zmm6, %zmm6
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm14 = xmm7[1,1,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14
+; AVX512VL-NEXT: vmulps %zmm6, %zmm11, %zmm6
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm4[1,1,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm11, %ymm11
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512VL-NEXT: vmulps %zmm11, %zmm10, %zmm10
+; AVX512VL-NEXT: vaddps %zmm10, %zmm6, %zmm6
+; AVX512VL-NEXT: vaddps %zmm5, %zmm6, %zmm5
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,3,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6
+; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,3,3,3]
+; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vmulps %zmm4, %zmm9, %zmm4
+; AVX512VL-NEXT: vextractf128 $1, %ymm7, %xmm6
+; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm6
+; AVX512VL-NEXT: vaddps %zmm4, %zmm5, %zmm4
+; AVX512VL-NEXT: vextractf32x4 $3, %zmm7, %xmm5
+; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm5
+; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm6, %zmm5
+; AVX512VL-NEXT: vmulps %zmm5, %zmm8, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6]
-; AVX512VL-NEXT: vmulps %zmm5, %zmm11, %zmm5
+; AVX512VL-NEXT: vmovshdup {{.*#+}} zmm5 = zmm7[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512VL-NEXT: vmulps %zmm5, %zmm12, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vmulps %zmm12, %zmm13, %zmm5
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512VL-NEXT: vmulps %zmm5, %zmm13, %zmm5
; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4
-; AVX512VL-NEXT: vmulps %zmm7, %zmm3, %zmm3
+; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6]
+; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm3
; AVX512VL-NEXT: vaddps %zmm3, %zmm4, %zmm3
; AVX512VL-NEXT: retq
entry: