[llvm] 73e14de - [X86] combineConcatVectorOps - recursively call combineConcatVectorOps instead of predicting when ops will freely concat (#130275)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Mar 8 03:49:59 PST 2025
Author: Simon Pilgrim
Date: 2025-03-08T11:49:55Z
New Revision: 73e14de207a3aa0fa071fa56756e8e316edf5227
URL: https://github.com/llvm/llvm-project/commit/73e14de207a3aa0fa071fa56756e8e316edf5227
DIFF: https://github.com/llvm/llvm-project/commit/73e14de207a3aa0fa071fa56756e8e316edf5227.diff
LOG: [X86] combineConcatVectorOps - recursively call combineConcatVectorOps instead of predicting when ops will freely concat (#130275)
The IsConcatFree helper is limited to estimates of where concatenating the subvector operands is beneficial; this patch replaces the FADD/FSUB/FMUL concatenation checks with a recursive call to combineConcatVectorOps to see if it will profitably concatenate further up the chain.
Other opcodes can be moved to use the CombineSubOperand helper in future patches.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/matrix-multiply.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 57a4c6f7a4869..0602f50ed1603 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41903,7 +41903,8 @@ static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget);
+ const X86Subtarget &Subtarget,
+ unsigned Depth = 0);
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
@@ -57791,7 +57792,8 @@ CastIntSETCCtoFP(MVT VT, ISD::CondCode CC, unsigned NumSignificantBitsLHS,
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
ArrayRef<SDValue> Ops, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned Depth) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
unsigned EltSizeInBits = VT.getScalarSizeInBits();
@@ -57803,6 +57805,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}))
return getZeroVector(VT, Subtarget, DAG, DL);
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
+
SDValue Op0 = Ops[0];
bool IsSplat = llvm::all_equal(Ops);
unsigned NumOps = Ops.size();
@@ -57933,6 +57938,20 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
return AllConstants || AllSubs;
};
+ auto CombineSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
+ bool AllConstants = true;
+ SmallVector<SDValue> Subs;
+ for (SDValue SubOp : SubOps) {
+ SDValue BC = peekThroughBitcasts(SubOp.getOperand(I));
+ AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
+ Subs.push_back(SubOp.getOperand(I));
+ }
+ if (AllConstants)
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
+ return combineConcatVectorOps(DL, VT, Subs, DAG, DCI, Subtarget,
+ Depth + 1);
+ };
switch (Op0.getOpcode()) {
case ISD::VECTOR_SHUFFLE: {
@@ -58354,14 +58373,17 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
case ISD::FADD:
case ISD::FSUB:
case ISD::FMUL:
- if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
- (VT.is256BitVector() ||
- (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
- return DAG.getNode(Op0.getOpcode(), DL, VT,
- ConcatSubOperand(VT, Ops, 0),
- ConcatSubOperand(VT, Ops, 1));
+ if (!IsSplat && (VT.is256BitVector() ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
+ SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+ SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+ if (Concat0 || Concat1)
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0),
+ Concat1 ? Concat1 : ConcatSubOperand(VT, Ops, 1));
}
break;
+ // Always prefer to concatenate high latency FDIV instructions.
case ISD::FDIV:
if (!IsSplat && (VT.is256BitVector() ||
(VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 7a5819c2978ae..1ee03c5f1223f 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1OR2,AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX1OR2,AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
;
; Basic matrix multiply tests based on the pattern:
@@ -117,22 +117,38 @@ define <4 x double> @test_mul2x2_f64(<4 x double> %a0, <4 x double> %a1) nounwin
; SSE-NEXT: movapd %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_mul2x2_f64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vmovddup {{.*#+}} xmm3 = xmm1[0,0]
-; AVX-NEXT: vmulpd %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,1]
-; AVX-NEXT: vmulpd %xmm4, %xmm2, %xmm4
-; AVX-NEXT: vaddpd %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0]
-; AVX-NEXT: vmulpd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,1]
-; AVX-NEXT: vmulpd %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: retq
+; AVX1-LABEL: test_mul2x2_f64:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX1-NEXT: vmulpd %ymm2, %ymm3, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX1-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: test_mul2x2_f64:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX2-NEXT: vmulpd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_mul2x2_f64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[1,1,3,3]
+; AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
+; AVX512-NEXT: vmulpd %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vmovddup {{.*#+}} ymm1 = ymm1[0,0,2,2]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-NEXT: vmulpd %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: retq
entry:
%split = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 0, i32 1>
%split1 = shufflevector <4 x double> %a0, <4 x double> poison, <2 x i32> <i32 2, i32 3>
@@ -958,227 +974,58 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin
; SSE-NEXT: movaps %xmm5, %xmm2
; SSE-NEXT: retq
;
-; AVX1-LABEL: test_mul4x4_f32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} xmm6 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm7, %xmm5, %xmm7
-; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vaddps %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm7 = xmm3[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm4, %xmm8, %xmm8
-; AVX1-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[0,0,0,0]
-; AVX1-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX1-NEXT: vmulps %xmm5, %xmm8, %xmm5
-; AVX1-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX1-NEXT: vmulps %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX1-NEXT: vmulps %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_mul4x4_f32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vbroadcastss %xmm2, %xmm6
-; AVX2-NEXT: vmulps %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm7, %xmm5, %xmm7
-; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm7, %xmm1, %xmm7
-; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm2[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm7, %xmm4, %xmm7
-; AVX2-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX2-NEXT: vbroadcastss %xmm2, %xmm7
-; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm2[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm2, %xmm4, %xmm2
-; AVX2-NEXT: vaddps %xmm2, %xmm7, %xmm2
-; AVX2-NEXT: vbroadcastss %xmm3, %xmm7
-; AVX2-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm1, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm4, %xmm8, %xmm8
-; AVX2-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX2-NEXT: vbroadcastss %xmm3, %xmm8
-; AVX2-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1]
-; AVX2-NEXT: vmulps %xmm5, %xmm8, %xmm5
-; AVX2-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2]
-; AVX2-NEXT: vmulps %xmm5, %xmm1, %xmm1
-; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3]
-; AVX2-NEXT: vmulps %xmm1, %xmm4, %xmm1
-; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: test_mul4x4_f32:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX512F-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; AVX512F-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vbroadcastss %xmm1, %xmm5
-; AVX512F-NEXT: vmulps %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm6, %xmm4, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm6, %xmm3, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX512F-NEXT: vbroadcastss %xmm6, %xmm7
-; AVX512F-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm4, %xmm8, %xmm8
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm3, %xmm8, %xmm8
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm6
-; AVX512F-NEXT: vaddps %xmm6, %xmm7, %xmm6
-; AVX512F-NEXT: vextractf32x4 $2, %zmm1, %xmm7
-; AVX512F-NEXT: vbroadcastss %xmm7, %xmm8
-; AVX512F-NEXT: vmulps %xmm0, %xmm8, %xmm8
-; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm4, %xmm9, %xmm9
-; AVX512F-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9
-; AVX512F-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm7, %xmm2, %xmm7
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vextractf32x4 $3, %zmm1, %xmm1
-; AVX512F-NEXT: vbroadcastss %xmm1, %xmm8
-; AVX512F-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512F-NEXT: vmulps %xmm4, %xmm8, %xmm4
-; AVX512F-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2]
-; AVX512F-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vaddps %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX512F-NEXT: vmulps %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX512F-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm1
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: retq
+; AVX1OR2-LABEL: test_mul4x4_f32:
+; AVX1OR2: # %bb.0: # %entry
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3]
+; AVX1OR2-NEXT: vmulps %ymm4, %ymm5, %ymm4
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT: vmulps %ymm0, %ymm6, %ymm0
+; AVX1OR2-NEXT: vaddps %ymm4, %ymm0, %ymm0
+; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT: vmulps %ymm7, %ymm4, %ymm7
+; AVX1OR2-NEXT: vaddps %ymm7, %ymm0, %ymm0
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX1OR2-NEXT: vmulps %ymm2, %ymm1, %ymm2
+; AVX1OR2-NEXT: vaddps %ymm2, %ymm0, %ymm0
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
+; AVX1OR2-NEXT: vmulps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4]
+; AVX1OR2-NEXT: vmulps %ymm5, %ymm6, %ymm5
+; AVX1OR2-NEXT: vaddps %ymm2, %ymm5, %ymm2
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6]
+; AVX1OR2-NEXT: vmulps %ymm5, %ymm4, %ymm4
+; AVX1OR2-NEXT: vaddps %ymm4, %ymm2, %ymm2
+; AVX1OR2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7]
+; AVX1OR2-NEXT: vmulps %ymm3, %ymm1, %ymm1
+; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1
+; AVX1OR2-NEXT: retq
;
-; AVX512VL-LABEL: test_mul4x4_f32:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm4
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm5
-; AVX512VL-NEXT: vmulps %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm2, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm3, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX512VL-NEXT: vbroadcastss %xmm6, %xmm7
-; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm7
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm6[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm8, %xmm8
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm3, %xmm8, %xmm8
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm6, %xmm4, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm7, %xmm6
-; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm7
-; AVX512VL-NEXT: vbroadcastss %xmm7, %xmm8
-; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm8
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm7[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm9, %xmm9
-; AVX512VL-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm7[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm3, %xmm9, %xmm9
-; AVX512VL-NEXT: vaddps %xmm9, %xmm8, %xmm8
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm7, %xmm4, %xmm7
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vextractf32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm8
-; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm8, %xmm2
-; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX512VL-NEXT: vmulps %xmm1, %xmm4, %xmm1
-; AVX512VL-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
-; AVX512VL-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_mul4x4_f32:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
+; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm3
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vshufps {{.*#+}} zmm2 = zmm1[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[2,3,2,3,2,3,2,3]
+; AVX512-NEXT: vmulps %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vshufps {{.*#+}} zmm3 = zmm1[0,0,0,0,4,4,4,4,8,8,8,8,12,12,12,12]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm4 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512-NEXT: vmulps %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vaddps %zmm2, %zmm3, %zmm2
+; AVX512-NEXT: vshufps {{.*#+}} zmm3 = zmm1[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm4 = zmm0[4,5,4,5,4,5,4,5]
+; AVX512-NEXT: vmulps %zmm3, %zmm4, %zmm3
+; AVX512-NEXT: vaddps %zmm3, %zmm2, %zmm2
+; AVX512-NEXT: vshufps {{.*#+}} zmm1 = zmm1[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7]
+; AVX512-NEXT: vmulps %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vaddps %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: retq
entry:
%split = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%split1 = shufflevector <16 x float> %a0, <16 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -1472,113 +1319,42 @@ define <16 x double> @test_mul4x4_f64(<16 x double> %a0, <16 x double> %a1) noun
; AVX2-NEXT: vmovapd %ymm6, %ymm2
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: test_mul4x4_f64:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT: vextractf64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vbroadcastsd %xmm2, %ymm6
-; AVX512F-NEXT: vmulpd %ymm6, %ymm0, %ymm6
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
-; AVX512F-NEXT: vmulpd %ymm7, %ymm5, %ymm7
-; AVX512F-NEXT: vaddpd %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
-; AVX512F-NEXT: vmulpd %ymm7, %ymm1, %ymm7
-; AVX512F-NEXT: vaddpd %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
-; AVX512F-NEXT: vmulpd %ymm7, %ymm4, %ymm7
-; AVX512F-NEXT: vaddpd %ymm7, %ymm6, %ymm6
-; AVX512F-NEXT: vextractf64x4 $1, %zmm2, %ymm2
-; AVX512F-NEXT: vbroadcastsd %xmm2, %ymm7
-; AVX512F-NEXT: vmulpd %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
-; AVX512F-NEXT: vmulpd %ymm5, %ymm8, %ymm8
-; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
-; AVX512F-NEXT: vmulpd %ymm1, %ymm8, %ymm8
-; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
-; AVX512F-NEXT: vmulpd %ymm2, %ymm4, %ymm2
-; AVX512F-NEXT: vaddpd %ymm2, %ymm7, %ymm2
-; AVX512F-NEXT: vbroadcastsd %xmm3, %ymm7
-; AVX512F-NEXT: vmulpd %ymm7, %ymm0, %ymm7
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512F-NEXT: vmulpd %ymm5, %ymm8, %ymm8
-; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
-; AVX512F-NEXT: vmulpd %ymm1, %ymm8, %ymm8
-; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
-; AVX512F-NEXT: vmulpd %ymm4, %ymm8, %ymm8
-; AVX512F-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512F-NEXT: vextractf64x4 $1, %zmm3, %ymm3
-; AVX512F-NEXT: vbroadcastsd %xmm3, %ymm8
-; AVX512F-NEXT: vmulpd %ymm0, %ymm8, %ymm0
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512F-NEXT: vmulpd %ymm5, %ymm8, %ymm5
-; AVX512F-NEXT: vaddpd %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[2,2,2,2]
-; AVX512F-NEXT: vmulpd %ymm5, %ymm1, %ymm1
-; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
-; AVX512F-NEXT: vmulpd %ymm1, %ymm4, %ymm1
-; AVX512F-NEXT: vaddpd %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: test_mul4x4_f64:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, %ymm4
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm1, %ymm5
-; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm6
-; AVX512VL-NEXT: vmulpd %ymm6, %ymm0, %ymm6
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[1,1,1,1]
-; AVX512VL-NEXT: vmulpd %ymm7, %ymm4, %ymm7
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[2,2,2,2]
-; AVX512VL-NEXT: vmulpd %ymm7, %ymm1, %ymm7
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm7 = ymm2[3,3,3,3]
-; AVX512VL-NEXT: vmulpd %ymm7, %ymm5, %ymm7
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm6, %ymm6
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm2, %ymm2
-; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm7
-; AVX512VL-NEXT: vmulpd %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[1,1,1,1]
-; AVX512VL-NEXT: vmulpd %ymm4, %ymm8, %ymm8
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm2[2,2,2,2]
-; AVX512VL-NEXT: vmulpd %ymm1, %ymm8, %ymm8
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3]
-; AVX512VL-NEXT: vmulpd %ymm2, %ymm5, %ymm2
-; AVX512VL-NEXT: vaddpd %ymm2, %ymm7, %ymm2
-; AVX512VL-NEXT: vbroadcastsd %xmm3, %ymm7
-; AVX512VL-NEXT: vmulpd %ymm7, %ymm0, %ymm7
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512VL-NEXT: vmulpd %ymm4, %ymm8, %ymm8
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[2,2,2,2]
-; AVX512VL-NEXT: vmulpd %ymm1, %ymm8, %ymm8
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[3,3,3,3]
-; AVX512VL-NEXT: vmulpd %ymm5, %ymm8, %ymm8
-; AVX512VL-NEXT: vaddpd %ymm7, %ymm8, %ymm7
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm3, %ymm3
-; AVX512VL-NEXT: vbroadcastsd %xmm3, %ymm8
-; AVX512VL-NEXT: vmulpd %ymm0, %ymm8, %ymm0
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm8 = ymm3[1,1,1,1]
-; AVX512VL-NEXT: vmulpd %ymm4, %ymm8, %ymm4
-; AVX512VL-NEXT: vaddpd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[2,2,2,2]
-; AVX512VL-NEXT: vmulpd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,3,3,3]
-; AVX512VL-NEXT: vmulpd %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vaddpd %ymm1, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf64x4 $1, %ymm1, %zmm7, %zmm1
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_mul4x4_f64:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vbroadcastsd %xmm2, %ymm4
+; AVX512-NEXT: vmulpd %ymm4, %ymm0, %ymm4
+; AVX512-NEXT: vextractf64x4 $1, %zmm2, %ymm5
+; AVX512-NEXT: vbroadcastsd %xmm5, %ymm5
+; AVX512-NEXT: vmulpd %ymm5, %ymm0, %ymm5
+; AVX512-NEXT: vbroadcastsd %xmm3, %ymm6
+; AVX512-NEXT: vmulpd %ymm6, %ymm0, %ymm6
+; AVX512-NEXT: vextractf64x4 $1, %zmm3, %ymm7
+; AVX512-NEXT: vbroadcastsd %xmm7, %ymm7
+; AVX512-NEXT: vmulpd %ymm7, %ymm0, %ymm7
+; AVX512-NEXT: vinsertf64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-NEXT: vpermpd {{.*#+}} zmm5 = zmm2[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm8 = zmm0[4,5,6,7,4,5,6,7]
+; AVX512-NEXT: vmulpd %zmm5, %zmm8, %zmm0
+; AVX512-NEXT: vaddpd %zmm0, %zmm4, %zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm4
+; AVX512-NEXT: vpermpd {{.*#+}} zmm5 = zmm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vmulpd %zmm5, %zmm4, %zmm5
+; AVX512-NEXT: vaddpd %zmm5, %zmm0, %zmm0
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7]
+; AVX512-NEXT: vmulpd %zmm2, %zmm1, %zmm2
+; AVX512-NEXT: vaddpd %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm7, %zmm6, %zmm2
+; AVX512-NEXT: vpermpd {{.*#+}} zmm5 = zmm3[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vmulpd %zmm5, %zmm8, %zmm5
+; AVX512-NEXT: vaddpd %zmm5, %zmm2, %zmm2
+; AVX512-NEXT: vpermpd {{.*#+}} zmm5 = zmm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vmulpd %zmm5, %zmm4, %zmm4
+; AVX512-NEXT: vaddpd %zmm4, %zmm2, %zmm2
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm3[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vmulpd %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vaddpd %zmm1, %zmm2, %zmm1
+; AVX512-NEXT: retq
entry:
%split = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%split1 = shufflevector <16 x double> %a0, <16 x double> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
More information about the llvm-commits
mailing list