[llvm] r317086 - [X86][SSE] Truncate with PACKSS any input with sufficient sign-bits
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 1 04:47:44 PDT 2017
Author: rksimon
Date: Wed Nov 1 04:47:44 2017
New Revision: 317086
URL: http://llvm.org/viewvc/llvm-project?rev=317086&view=rev
Log:
[X86][SSE] Truncate with PACKSS any input with sufficient sign-bits
So far we've only been using PACKSS truncations with 'all-bits or zero-bits' patterns (vector comparison results etc.), when really we can safely use it for any case as long as the number of sign bits reaches down to the last 16-bits (or 8-bits if we're truncating to bytes).
The next steps after this are to add the equivalent support for PACKUS and to support packing to sub-128 bit vectors for truncating stores etc.
Differential Revision: https://reviews.llvm.org/D39476
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avg.ll
llvm/trunk/test/CodeGen/X86/avx2-shift.ll
llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll
llvm/trunk/test/CodeGen/X86/vector-trunc.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=317086&r1=317085&r2=317086&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Nov 1 04:47:44 2017
@@ -16222,8 +16222,10 @@ SDValue X86TargetLowering::LowerTRUNCATE
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
- // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
- if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
+ // Truncate with PACKSS if we are truncating a vector with sign-bits that
+ // extend all the way to the packed/truncated value.
+ unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+ if ((InVT.getScalarSizeInBits() - NumPackedBits) < DAG.ComputeNumSignBits(In))
if (SDValue V = truncateVectorWithPACKSS(VT, In, DL, DAG, Subtarget))
return V;
@@ -34422,7 +34424,7 @@ static SDValue combineVectorTruncation(S
return SDValue();
}
-/// This function transforms vector truncation of 'all or none' bits values.
+/// This function transforms vector truncation of 'extended sign-bits' values.
/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
SelectionDAG &DAG,
@@ -34444,12 +34446,6 @@ static SDValue combineVectorSignBitsTrun
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
- // Use PACKSS if the input is a splatted sign bit.
- // e.g. Comparison result, sext_in_reg, etc.
- unsigned NumSignBits = DAG.ComputeNumSignBits(In);
- if (NumSignBits != InSVT.getSizeInBits())
- return SDValue();
-
// Check we have a truncation suited for PACKSS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
@@ -34458,6 +34454,13 @@ static SDValue combineVectorSignBitsTrun
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+ if (NumSignBits <= (InSVT.getSizeInBits() - NumPackedBits))
+ return SDValue();
+
return truncateVectorWithPACKSS(VT, In, DL, DAG, Subtarget);
}
Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=317086&r1=317085&r2=317086&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Wed Nov 1 04:47:44 2017
@@ -2567,52 +2567,51 @@ define void @avg_v64i8_const(<64 x i8>*
; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6
; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5
; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4
-; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm9
; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2
; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1
; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $1, %ymm0, %ymm10
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $1, %ymm2, %ymm8
-; AVX2-NEXT: vpsrld $1, %ymm3, %ymm9
+; AVX2-NEXT: vpsrld $1, %ymm2, %ymm3
+; AVX2-NEXT: vpsrld $1, %ymm9, %ymm8
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5
; AVX2-NEXT: vpsrld $1, %ymm6, %ymm6
-; AVX2-NEXT: vpsrld $1, %ymm7, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm7
-; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-NEXT: vpsrld $1, %ymm7, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-NEXT: vpackssdw %xmm7, %xmm2, %xmm7
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm0
+; AVX2-NEXT: vpackssdw %xmm0, %xmm6, %xmm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT: vpshufb %xmm2, %xmm5, %xmm5
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm6
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm6[0],xmm5[0]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm8, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX2-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm4
+; AVX2-NEXT: vpackssdw %xmm4, %xmm10, %xmm4
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqu %ymm1, (%rax)
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vmovdqu %ymm4, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/avx2-shift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-shift.ll?rev=317086&r1=317085&r2=317086&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-shift.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-shift.ll Wed Nov 1 04:47:44 2017
@@ -556,9 +556,8 @@ define <8 x i16> @variable_ashr16(<8 x i
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
@@ -567,9 +566,8 @@ define <8 x i16> @variable_ashr16(<8 x i
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%res = ashr <8 x i16> %lhs, %rhs
Modified: llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll?rev=317086&r1=317085&r2=317086&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vector-shifts.ll Wed Nov 1 04:47:44 2017
@@ -499,9 +499,8 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r
; X32-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X32-NEXT: vpmovsxwd %xmm0, %ymm0
; X32-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X32-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X32-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X32-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X32-NEXT: vzeroupper
; X32-NEXT: retl
;
@@ -510,9 +509,8 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r
; X64-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; X64-NEXT: vpmovsxwd %xmm0, %ymm0
; X64-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; X64-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; X64-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; X64-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X64-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
%ashr = ashr <8 x i16> %r, %a
Modified: llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll?rev=317086&r1=317085&r2=317086&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll Wed Nov 1 04:47:44 2017
@@ -318,9 +318,8 @@ define <8 x i16> @var_shift_v8i16(<8 x i
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1249,9 +1248,8 @@ define <8 x i16> @constant_shift_v8i16(<
; AVX2: # BB#0:
; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=317086&r1=317085&r2=317086&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Wed Nov 1 04:47:44 2017
@@ -404,9 +404,8 @@ define <8 x i16> @trunc8i32_8i16_ashr(<8
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -695,62 +694,28 @@ entry:
}
define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i16_ashr:
-; SSE2: # BB#0: # %entry
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i16_ashr:
-; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i16_ashr:
-; SSE41: # BB#0: # %entry
-; SSE41-NEXT: psrad $16, %xmm2
-; SSE41-NEXT: psrad $16, %xmm3
-; SSE41-NEXT: psrad $16, %xmm0
-; SSE41-NEXT: psrad $16, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: movdqu %xmm2, (%rax)
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc16i32_16i16_ashr:
+; SSE: # BB#0: # %entry
+; SSE-NEXT: psrad $16, %xmm3
+; SSE-NEXT: psrad $16, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: psrad $16, %xmm1
+; SSE-NEXT: psrad $16, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
+; SSE-NEXT: movdqu %xmm2, (%rax)
+; SSE-NEXT: movdqu %xmm0, (%rax)
+; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovups %ymm0, (%rax)
; AVX1-NEXT: vzeroupper
@@ -760,11 +725,8 @@ define void @trunc16i32_16i16_ashr(<16 x
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -935,53 +897,39 @@ entry:
define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE-LABEL: trunc16i32_16i8_ashr:
; SSE: # BB#0: # %entry
-; SSE-NEXT: psrad $24, %xmm0
; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: psrad $24, %xmm0
+; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: psrad $24, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: psrad $24, %xmm2
+; SSE-NEXT: packssdw %xmm3, %xmm2
+; SSE-NEXT: packsswb %xmm2, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
+; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1031,12 +979,12 @@ define void @trunc16i32_16i8_lshr(<16 x
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpackssdw %ymm0, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
@@ -1153,12 +1101,10 @@ define void @trunc16i16_16i8_ashr(<16 x
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1167,10 +1113,7 @@ define void @trunc16i16_16i8_ashr(<16 x
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
More information about the llvm-commits
mailing list