[llvm] 19d34f6 - [X86] combinePMULH - recognise 'cheap' truncations via PACKS/PACKUS as well as SEXT/ZEXT
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 1 08:38:06 PST 2021
Author: Simon Pilgrim
Date: 2021-12-01T16:37:49Z
New Revision: 19d34f6e95fe94928ec275168d4d69d89aa54bbc
URL: https://github.com/llvm/llvm-project/commit/19d34f6e95fe94928ec275168d4d69d89aa54bbc
DIFF: https://github.com/llvm/llvm-project/commit/19d34f6e95fe94928ec275168d4d69d89aa54bbc.diff
LOG: [X86] combinePMULH - recognise 'cheap' truncations via PACKS/PACKUS as well as SEXT/ZEXT
combinePMULH currently only narrows vXi32/vXi64 multiplies to PMULHW/PMULHUW when both source operands are SEXT/ZEXT instructions, which makes the truncation of the operands 'free'.
But we can generalize this to any source operands with enough leading sign/zero bits to allow PACKSS/PACKUS to be used as a 'cheap' truncation.
This lets us avoid the wider multiplies, at the cost of truncating both source operands instead of just the result.
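As a concrete sketch (modelled on the lshr_mulhuw_v8i16 test in llvm/test/CodeGen/X86/pmulh.ll rather than copied from it), the kind of IR this now catches is a multiply-high idiom whose operands are only known to fit in 16 bits via a shift or mask, not an explicit zext:

  define <8 x i16> @lshr_mulhuw_sketch(<8 x i32> %a, <8 x i32> %b) {
    ; the shifts clear the upper 16 bits of every lane, so truncating the
    ; operands to <8 x i16> is cheap (e.g. PACKUSDW on SSE4.1+)
    %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %c = mul <8 x i32> %a1, %b1
    %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
    %e = trunc <8 x i32> %d to <8 x i16>
    ret <8 x i16> %e
  }

With this patch both operands are packed down to v8i16 and the multiply selects PMULHUW directly, rather than doing a full vXi32 multiply and then shifting and truncating the result.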
Differential Revision: https://reviews.llvm.org/D113371
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/pmulh.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9048c24a7af88..df5a041b87cd3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4464,6 +4464,9 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
SDLoc DL(N);
if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
// fold (mulhs x, 0) -> 0
// do not return N0/N1, because undef node may exist.
if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) ||
@@ -4521,6 +4524,9 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
SDLoc DL(N);
if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
+ return FoldedVOp;
+
// fold (mulhu x, 0) -> 0
// do not return N0/N1, because undef node may exist.
if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c45060aba8b02..8b6fd00d31744 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48562,20 +48562,50 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
SDValue LHS = Src.getOperand(0).getOperand(0);
SDValue RHS = Src.getOperand(0).getOperand(1);
- unsigned ExtOpc = LHS.getOpcode();
- if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
- RHS.getOpcode() != ExtOpc)
- return SDValue();
-
- // Peek through the extends.
- LHS = LHS.getOperand(0);
- RHS = RHS.getOperand(0);
-
- // Ensure the input types match.
- if (LHS.getValueType() != VT || RHS.getValueType() != VT)
- return SDValue();
+ // Count leading sign/zero bits on both inputs - if there are enough then
+ // truncation back to vXi16 will be cheap - either as a pack/shuffle
+ // sequence or using AVX512 truncations. If the inputs are sext/zext then the
+ // truncations may actually be free by peeking through to the ext source.
+ auto IsSext = [&DAG](SDValue V) {
+ return DAG.ComputeMinSignedBits(V) <= 16;
+ };
+ auto IsZext = [&DAG](SDValue V) {
+ return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
+ };
- unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ bool IsSigned = IsSext(LHS) && IsSext(RHS);
+ bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
+ if (!IsSigned && !IsUnsigned)
+ return SDValue();
+
+ // Check if both inputs are extensions, which will be removed by truncation.
+ bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
+ LHS.getOpcode() == ISD::ZERO_EXTEND) &&
+ (RHS.getOpcode() == ISD::SIGN_EXTEND ||
+ RHS.getOpcode() == ISD::ZERO_EXTEND) &&
+ LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
+ RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
+
+ // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
+ // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
+ // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
+ // will have to split anyway.
+ unsigned InSizeInBits = InVT.getSizeInBits();
+ if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
+ !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
+ (InSizeInBits % 16) == 0) {
+ EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ InVT.getSizeInBits() / 16);
+ SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
+ DAG.getBitcast(BCVT, RHS));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
+ }
+
+ // Truncate back to source type.
+ LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
+ RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
+
+ unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
return DAG.getNode(Opc, DL, VT, LHS, RHS);
}
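To see why the AVX2 bitcast path above is sound, consider one i32 lane per operand with the upper 16 bits known zero (a worked example for illustration, not taken from the patch): a = 0x0000ABCD, b = 0x00001234. Bitcast to i16 lanes these are [0xABCD, 0x0000] and [0x1234, 0x0000]. PMULHUW yields the high 16 bits of each 16x16 product, giving [0x0C37, 0x0000] (since 0xABCD * 0x1234 = 0x0C374FA4, and the upper i16 lanes contribute 0 * 0 = 0). Bitcasting back to i32 gives 0x00000C37, and the final truncate to i16 keeps 0x0C37, which is exactly (a * b) >> 16, the MULHU result the original wide multiply would have produced.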
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 2e74e7f5f9e39..6d28b75fd1984 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -26,44 +26,39 @@ define <4 x i16> @zext_mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: and_mulhuw_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: psrlq $16, %xmm0
-; SSE2-NEXT: psrlq $16, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: pmulhuw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: and_mulhuw_v4i16:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4],xmm4[5],xmm0[6,7]
-; SSE41-NEXT: pmuldq %xmm2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4],xmm4[5],xmm3[6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4],xmm4[5],xmm1[6,7]
-; SSE41-NEXT: pmuldq %xmm3, %xmm1
-; SSE41-NEXT: psrlq $16, %xmm1
-; SSE41-NEXT: psrlq $16, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm2, %xmm2
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm0, %xmm0
+; SSE41-NEXT: pmulhuw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: and_mulhuw_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
-; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $16, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -73,10 +68,9 @@ define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
; AVX512-LABEL: and_mulhuw_v4i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -118,21 +112,29 @@ define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
;
; SSE41-LABEL: ashr_mulhw_v4i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaddwd %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: packusdw %xmm1, %xmm1
; SSE41-NEXT: packusdw %xmm0, %xmm0
+; SSE41-NEXT: pmulhw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: ashr_mulhw_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX2-LABEL: ashr_mulhw_v4i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: ashr_mulhw_v4i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: retq
%a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
%b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
%c = mul <4 x i32> %a1, %b1
@@ -175,21 +177,18 @@ define <8 x i16> @lshr_mulhuw_v8i16(<8 x i32> %a, <8 x i32> %b) {
; SSE41: # %bb.0:
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $16, %xmm3
-; SSE41-NEXT: pmulld %xmm1, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
-; SSE41-NEXT: pmulld %xmm2, %xmm0
-; SSE41-NEXT: psrld $16, %xmm3
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm3, %xmm0
+; SSE41-NEXT: packusdw %xmm3, %xmm2
+; SSE41-NEXT: pmulhuw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: lshr_mulhuw_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
@@ -199,8 +198,7 @@ define <8 x i16> @lshr_mulhuw_v8i16(<8 x i32> %a, <8 x i32> %b) {
; AVX512: # %bb.0:
; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
@@ -232,42 +230,20 @@ define <8 x i16> @sext_mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
}
define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
-; SSE2-LABEL: sextinreg_mulhw_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $24, %xmm1
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pslld $24, %xmm0
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pslld $25, %xmm3
-; SSE2-NEXT: psrad $25, %xmm3
-; SSE2-NEXT: pslld $25, %xmm2
-; SSE2-NEXT: psrad $25, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sextinreg_mulhw_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $24, %xmm1
-; SSE41-NEXT: psrad $24, %xmm1
-; SSE41-NEXT: pslld $24, %xmm0
-; SSE41-NEXT: psrad $24, %xmm0
-; SSE41-NEXT: pslld $25, %xmm3
-; SSE41-NEXT: psrad $25, %xmm3
-; SSE41-NEXT: pmulld %xmm1, %xmm3
-; SSE41-NEXT: pslld $25, %xmm2
-; SSE41-NEXT: psrad $25, %xmm2
-; SSE41-NEXT: pmulld %xmm2, %xmm0
-; SSE41-NEXT: psrld $16, %xmm3
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm3, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: sextinreg_mulhw_v8i16:
+; SSE: # %bb.0:
+; SSE-NEXT: pslld $24, %xmm1
+; SSE-NEXT: psrad $24, %xmm1
+; SSE-NEXT: pslld $24, %xmm0
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: pslld $25, %xmm3
+; SSE-NEXT: psrad $25, %xmm3
+; SSE-NEXT: pslld $25, %xmm2
+; SSE-NEXT: psrad $25, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: pmulhw %xmm2, %xmm0
+; SSE-NEXT: retq
;
; AVX2-LABEL: sextinreg_mulhw_v8i16:
; AVX2: # %bb.0:
@@ -275,10 +251,11 @@ define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT: vpslld $25, %ymm1, %ymm1
; AVX2-NEXT: vpsrad $25, %ymm1, %ymm1
-; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -288,10 +265,9 @@ define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
; AVX512-NEXT: vpsrad $24, %ymm0, %ymm0
; AVX512-NEXT: vpslld $25, %ymm1, %ymm1
; AVX512-NEXT: vpsrad $25, %ymm1, %ymm1
-; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a1 = shl <8 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
@@ -348,22 +324,18 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
; SSE2-NEXT: pand %xmm6, %xmm3
; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm6, %xmm1
; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pmaddwd %xmm3, %xmm7
; SSE2-NEXT: pand %xmm6, %xmm8
-; SSE2-NEXT: pmaddwd %xmm2, %xmm8
+; SSE2-NEXT: packssdw %xmm7, %xmm8
+; SSE2-NEXT: pmulhw %xmm2, %xmm8
; SSE2-NEXT: pand %xmm6, %xmm5
-; SSE2-NEXT: pmaddwd %xmm1, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pmaddwd %xmm6, %xmm0
-; SSE2-NEXT: psrld $16, %xmm7
-; SSE2-NEXT: psrld $16, %xmm8
-; SSE2-NEXT: packssdw %xmm7, %xmm8
-; SSE2-NEXT: psrld $16, %xmm5
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: packssdw %xmm5, %xmm0
+; SSE2-NEXT: packssdw %xmm5, %xmm6
+; SSE2-NEXT: pmulhw %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: retq
;
@@ -373,48 +345,42 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [32767,32767,32767,32767]
; SSE41-NEXT: pand %xmm6, %xmm3
; SSE41-NEXT: pand %xmm6, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm6, %xmm1
; SSE41-NEXT: pand %xmm6, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm6, %xmm7
-; SSE41-NEXT: pmaddwd %xmm3, %xmm7
; SSE41-NEXT: pand %xmm6, %xmm8
-; SSE41-NEXT: pmaddwd %xmm2, %xmm8
+; SSE41-NEXT: packusdw %xmm7, %xmm8
+; SSE41-NEXT: pmulhw %xmm2, %xmm8
; SSE41-NEXT: pand %xmm6, %xmm5
-; SSE41-NEXT: pmaddwd %xmm1, %xmm5
; SSE41-NEXT: pand %xmm4, %xmm6
-; SSE41-NEXT: pmaddwd %xmm6, %xmm0
-; SSE41-NEXT: psrld $16, %xmm7
-; SSE41-NEXT: psrld $16, %xmm8
-; SSE41-NEXT: packusdw %xmm7, %xmm8
-; SSE41-NEXT: psrld $16, %xmm5
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm5, %xmm0
+; SSE41-NEXT: packusdw %xmm5, %xmm6
+; SSE41-NEXT: pmulhw %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm8, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: and_mulhuw_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
-; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm2
+; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: and_mulhuw_v16i16:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
-; AVX512F-NEXT: vpandd %zmm2, %zmm0, %zmm0
-; AVX512F-NEXT: vpandd %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: and_mulhuw_v16i16:
@@ -422,8 +388,7 @@ define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
; AVX512BW-NEXT: vpandd %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpandd %zmm2, %zmm1, %zmm1
-; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: retq
%a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
@@ -475,58 +440,43 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
;
; SSE41-LABEL: ashr_mulhuw_v16i16:
; SSE41: # %bb.0:
-; SSE41-NEXT: psrld $16, %xmm4
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaddwd %xmm4, %xmm0
-; SSE41-NEXT: psrld $16, %xmm5
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaddwd %xmm5, %xmm1
-; SSE41-NEXT: psrld $16, %xmm6
-; SSE41-NEXT: psrld $16, %xmm2
-; SSE41-NEXT: pmaddwd %xmm6, %xmm2
-; SSE41-NEXT: psrld $16, %xmm7
-; SSE41-NEXT: psrld $16, %xmm3
-; SSE41-NEXT: pmaddwd %xmm7, %xmm3
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
+; SSE41-NEXT: psrld $16, %xmm7
+; SSE41-NEXT: psrld $16, %xmm6
+; SSE41-NEXT: packusdw %xmm7, %xmm6
+; SSE41-NEXT: pmulhw %xmm2, %xmm6
+; SSE41-NEXT: psrld $16, %xmm5
+; SSE41-NEXT: psrld $16, %xmm4
+; SSE41-NEXT: packusdw %xmm5, %xmm4
+; SSE41-NEXT: pmulhw %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: ashr_mulhuw_v16i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $16, %ymm3, %ymm2
-; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $16, %ymm3, %ymm1
+; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
+; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: ashr_mulhuw_v16i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsrad $16, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrad $16, %zmm1, %zmm1
-; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: ashr_mulhuw_v16i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrld $16, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: retq
+; AVX512-LABEL: ashr_mulhuw_v16i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT: vpsrld $16, %zmm1, %zmm1
+; AVX512-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: vpmulhw %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: retq
%a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%c = mul <16 x i32> %a1, %b1