[llvm] 5ab3bc0 - [X86][SSE] Add pmulh tests where the source ops are not generated from sign/zero-extends

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 25 05:15:02 PDT 2021


Author: Simon Pilgrim
Date: 2021-03-25T12:12:15Z
New Revision: 5ab3bc0683c0ee7848b3fe991c35d73d0c9a603e

URL: https://github.com/llvm/llvm-project/commit/5ab3bc0683c0ee7848b3fe991c35d73d0c9a603e
DIFF: https://github.com/llvm/llvm-project/commit/5ab3bc0683c0ee7848b3fe991c35d73d0c9a603e.diff

LOG: [X86][SSE] Add pmulh tests where the source ops are not generated from sign/zero-extends
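
For reference, the existing coverage in pmulh.ll builds the widened multiply operands directly from sign/zero extends, a pattern the X86 backend already folds to PMULHW/PMULHUW. The new tests instead establish the 16-bit range of the operands some other way (and-masking, ashr/lshr, or shl+ashr sign-extend-in-reg), and the checked codegen in the diff below shows these variants are not yet matched. A minimal sketch of the two flavours (function names are illustrative, not taken from the patch):

; already folded to pmulhuw: operands come from zero-extends
define <8 x i16> @mulhuw_from_zext(<8 x i16> %a, <8 x i16> %b) {
  %a32 = zext <8 x i16> %a to <8 x i32>
  %b32 = zext <8 x i16> %b to <8 x i32>
  %m   = mul <8 x i32> %a32, %b32
  %hi  = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %r   = trunc <8 x i32> %hi to <8 x i16>
  ret <8 x i16> %r
}

; new style: the operands are known to fit in 16 bits via lshr rather than
; an explicit extend, so the same pmulhuw fold ought to apply eventually
define <8 x i16> @mulhuw_from_lshr(<8 x i32> %a, <8 x i32> %b) {
  %a16 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %b16 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %m   = mul <8 x i32> %a16, %b16
  %hi  = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %r   = trunc <8 x i32> %hi to <8 x i16>
  ret <8 x i16> %r
}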

Added: 
    

Modified: 
    llvm/test/CodeGen/X86/pmulh.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index aa0daff14f62..4c932f13595a 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -23,6 +23,71 @@ define <4 x i16> @zext_mulhuw_v4i16(<4 x i16> %a, <4 x i16> %b) {
   ret <4 x i16> %e
 }
 
+define <4 x i16> @and_mulhuw_v4i16(<4 x i64> %a, <4 x i64> %b) {
+; SSE2-LABEL: and_mulhuw_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [65535,0,0,0,65535,0,0,0]
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    pmuludq %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    pmuludq %xmm2, %xmm0
+; SSE2-NEXT:    psrlq $16, %xmm0
+; SSE2-NEXT:    psrlq $16, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: and_mulhuw_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pxor %xmm4, %xmm4
+; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4],xmm4[5],xmm2[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4],xmm4[5],xmm0[6,7]
+; SSE41-NEXT:    pmuldq %xmm2, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4],xmm4[5],xmm3[6,7]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3,4],xmm4[5],xmm1[6,7]
+; SSE41-NEXT:    pmuldq %xmm3, %xmm1
+; SSE41-NEXT:    psrlq $16, %xmm1
+; SSE41-NEXT:    psrlq $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: and_mulhuw_v4i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
+; AVX2-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrlq $16, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: and_mulhuw_v4i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT:    vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7,8],ymm2[9],ymm1[10,11,12],ymm2[13],ymm1[14,15]
+; AVX512-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrlq $16, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %a1 = and <4 x i64> %a, <i64 65535, i64 65535, i64 65535, i64 65535>
+  %b1 = and <4 x i64> %b, <i64 65535, i64 65535, i64 65535, i64 65535>
+  %c = mul <4 x i64> %a1, %b1
+  %d = lshr <4 x i64> %c, <i64 16, i64 16, i64 16, i64 16>
+  %e = trunc <4 x i64> %d to <4 x i16>
+  ret <4 x i16> %e
+}
+
 define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
 ; SSE-LABEL: sext_mulhw_v4i16:
 ; SSE:       # %bb.0:
@@ -41,6 +106,41 @@ define <4 x i16> @sext_mulhw_v4i16(<4 x i16> %a, <4 x i16> %b) {
   ret <4 x i16> %e
 }
 
+define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: ashr_mulhw_v4i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    packssdw %xmm1, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm0, %xmm0
+; SSE2-NEXT:    pmulhw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ashr_mulhw_v4i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    pmulld %xmm1, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm0, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: ashr_mulhw_v4i16:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
+  %b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
+  %c = mul <4 x i32> %a1, %b1
+  %d = lshr <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+  %e = trunc <4 x i32> %d to <4 x i16>
+  ret <4 x i16> %e
+}
+
 define <8 x i16> @zext_mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: zext_mulhuw_v8i16:
 ; SSE:       # %bb.0:
@@ -59,6 +159,60 @@ define <8 x i16> @zext_mulhuw_v8i16(<8 x i16> %a, <8 x i16> %b) {
   ret <8 x i16> %e
 }
 
+define <8 x i16> @lshr_mulhuw_v8i16(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: lshr_mulhuw_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    pmulhuw %xmm2, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: lshr_mulhuw_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psrld $16, %xmm1
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm3
+; SSE41-NEXT:    pmulld %xmm1, %xmm3
+; SSE41-NEXT:    psrld $16, %xmm2
+; SSE41-NEXT:    pmulld %xmm2, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm3
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: lshr_mulhuw_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: lshr_mulhuw_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %a1 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %b1 = lshr <8 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %c = mul <8 x i32> %a1, %b1
+  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %e = trunc <8 x i32> %d to <8 x i16>
+  ret <8 x i16> %e
+}
+
 define <8 x i16> @sext_mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; SSE-LABEL: sext_mulhw_v8i16:
 ; SSE:       # %bb.0:
@@ -77,6 +231,79 @@ define <8 x i16> @sext_mulhw_v8i16(<8 x i16> %a, <8 x i16> %b) {
   ret <8 x i16> %e
 }
 
+define <8 x i16> @sextinreg_mulhw_v8i16(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: sextinreg_mulhw_v8i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pslld $24, %xmm1
+; SSE2-NEXT:    psrad $24, %xmm1
+; SSE2-NEXT:    pslld $24, %xmm0
+; SSE2-NEXT:    psrad $24, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    pslld $25, %xmm3
+; SSE2-NEXT:    psrad $25, %xmm3
+; SSE2-NEXT:    pslld $25, %xmm2
+; SSE2-NEXT:    psrad $25, %xmm2
+; SSE2-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-NEXT:    pmullw %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: sextinreg_mulhw_v8i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    pslld $24, %xmm1
+; SSE41-NEXT:    psrad $24, %xmm1
+; SSE41-NEXT:    pslld $24, %xmm0
+; SSE41-NEXT:    psrad $24, %xmm0
+; SSE41-NEXT:    pslld $25, %xmm3
+; SSE41-NEXT:    psrad $25, %xmm3
+; SSE41-NEXT:    pmulld %xmm1, %xmm3
+; SSE41-NEXT:    pslld $25, %xmm2
+; SSE41-NEXT:    psrad $25, %xmm2
+; SSE41-NEXT:    pmulld %xmm2, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm3
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm3, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: sextinreg_mulhw_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpslld $24, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT:    vpslld $25, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $25, %ymm1, %ymm1
+; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: sextinreg_mulhw_v8i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpslld $24, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrad $24, %ymm0, %ymm0
+; AVX512-NEXT:    vpslld $25, %ymm1, %ymm1
+; AVX512-NEXT:    vpsrad $25, %ymm1, %ymm1
+; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %a1 = shl <8 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  %b1 = shl <8 x i32> %b, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %a2 = ashr <8 x i32> %a1, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
+  %b2 = ashr <8 x i32> %b1, <i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25, i32 25>
+  %c = mul <8 x i32> %a2, %b2
+  %d = lshr <8 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %e = trunc <8 x i32> %d to <8 x i16>
+  ret <8 x i16> %e
+}
+
 define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: zext_mulhuw_v16i16:
 ; SSE:       # %bb.0:
@@ -96,6 +323,103 @@ define <16 x i16> @zext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
   ret <16 x i16> %e
 }
 
+define <16 x i16> @and_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: and_mulhuw_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
+; SSE2-NEXT:    pand %xmm8, %xmm3
+; SSE2-NEXT:    pand %xmm8, %xmm2
+; SSE2-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm8, %xmm1
+; SSE2-NEXT:    pand %xmm8, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    pand %xmm8, %xmm7
+; SSE2-NEXT:    pand %xmm8, %xmm6
+; SSE2-NEXT:    packssdw %xmm7, %xmm6
+; SSE2-NEXT:    pmulhw %xmm2, %xmm6
+; SSE2-NEXT:    pand %xmm8, %xmm5
+; SSE2-NEXT:    pand %xmm8, %xmm4
+; SSE2-NEXT:    packssdw %xmm5, %xmm4
+; SSE2-NEXT:    pmulhw %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT:    packssdw %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm4, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    packssdw %xmm1, %xmm4
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: and_mulhuw_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [32767,32767,32767,32767]
+; SSE41-NEXT:    pand %xmm8, %xmm3
+; SSE41-NEXT:    pand %xmm8, %xmm2
+; SSE41-NEXT:    pand %xmm8, %xmm1
+; SSE41-NEXT:    pand %xmm8, %xmm0
+; SSE41-NEXT:    pand %xmm8, %xmm7
+; SSE41-NEXT:    pmaddwd %xmm3, %xmm7
+; SSE41-NEXT:    pand %xmm8, %xmm6
+; SSE41-NEXT:    pmaddwd %xmm2, %xmm6
+; SSE41-NEXT:    pand %xmm8, %xmm5
+; SSE41-NEXT:    pmaddwd %xmm1, %xmm5
+; SSE41-NEXT:    pand %xmm8, %xmm4
+; SSE41-NEXT:    pmaddwd %xmm4, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm7
+; SSE41-NEXT:    psrld $16, %xmm6
+; SSE41-NEXT:    packusdw %xmm7, %xmm6
+; SSE41-NEXT:    psrld $16, %xmm5
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: and_mulhuw_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm4 = [32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
+; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm3
+; AVX2-NEXT:    vpmaddwd %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpand %ymm4, %ymm2, %ymm2
+; AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: and_mulhuw_v16i16:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512F-NEXT:    vpandd %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vpandd %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsrld $16, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: and_mulhuw_v16i16:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767]
+; AVX512BW-NEXT:    vpandd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpandd %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsrld $16, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT:    retq
+  %a1 = and <16 x i32> %a, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %b1 = and <16 x i32> %b, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
+  %c = mul <16 x i32> %a1, %b1
+  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %e = trunc <16 x i32> %d to <16 x i16>
+  ret <16 x i16> %e
+}
+
 define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SSE-LABEL: sext_mulhuw_v16i16:
 ; SSE:       # %bb.0:
@@ -115,6 +439,79 @@ define <16 x i16> @sext_mulhuw_v16i16(<16 x i16> %a, <16 x i16> %b) {
   ret <16 x i16> %e
 }
 
+define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
+; SSE2-LABEL: ashr_mulhuw_v16i16:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    psrad $16, %xmm5
+; SSE2-NEXT:    psrad $16, %xmm4
+; SSE2-NEXT:    packssdw %xmm5, %xmm4
+; SSE2-NEXT:    psrad $16, %xmm1
+; SSE2-NEXT:    psrad $16, %xmm0
+; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    pmulhw %xmm4, %xmm0
+; SSE2-NEXT:    psrad $16, %xmm7
+; SSE2-NEXT:    psrad $16, %xmm6
+; SSE2-NEXT:    packssdw %xmm7, %xmm6
+; SSE2-NEXT:    psrad $16, %xmm3
+; SSE2-NEXT:    psrad $16, %xmm2
+; SSE2-NEXT:    packssdw %xmm3, %xmm2
+; SSE2-NEXT:    pmulhw %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: ashr_mulhuw_v16i16:
+; SSE41:       # %bb.0:
+; SSE41-NEXT:    psrad $16, %xmm3
+; SSE41-NEXT:    psrad $16, %xmm2
+; SSE41-NEXT:    psrad $16, %xmm1
+; SSE41-NEXT:    psrad $16, %xmm0
+; SSE41-NEXT:    psrad $16, %xmm7
+; SSE41-NEXT:    pmulld %xmm3, %xmm7
+; SSE41-NEXT:    psrad $16, %xmm6
+; SSE41-NEXT:    pmulld %xmm2, %xmm6
+; SSE41-NEXT:    psrad $16, %xmm5
+; SSE41-NEXT:    pmulld %xmm1, %xmm5
+; SSE41-NEXT:    psrad $16, %xmm4
+; SSE41-NEXT:    pmulld %xmm4, %xmm0
+; SSE41-NEXT:    psrld $16, %xmm7
+; SSE41-NEXT:    psrld $16, %xmm6
+; SSE41-NEXT:    packusdw %xmm7, %xmm6
+; SSE41-NEXT:    psrld $16, %xmm5
+; SSE41-NEXT:    psrld $16, %xmm0
+; SSE41-NEXT:    packusdw %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm6, %xmm1
+; SSE41-NEXT:    retq
+;
+; AVX2-LABEL: ashr_mulhuw_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpsrad $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrad $16, %ymm3, %ymm3
+; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrad $16, %ymm2, %ymm2
+; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpsrld $16, %ymm1, %ymm1
+; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ashr_mulhuw_v16i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrad $16, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrad $16, %zmm1, %zmm1
+; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    retq
+  %a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %c = mul <16 x i32> %a1, %b1
+  %d = lshr <16 x i32> %c, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %e = trunc <16 x i32> %d to <16 x i16>
+  ret <16 x i16> %e
+}
+
 define <32 x i16> @zext_mulhuw_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; SSE-LABEL: zext_mulhuw_v32i16:
 ; SSE:       # %bb.0:


        

