[llvm] f471f6f - [X86] combineTruncateWithSat - relax minimum truncation size for PACKSS/PACKUS

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 1 03:33:48 PDT 2023


Author: Simon Pilgrim
Date: 2023-11-01T10:33:35Z
New Revision: f471f6ff2f02a5081f89e0daf18c2ee9f3dc103d

URL: https://github.com/llvm/llvm-project/commit/f471f6ff2f02a5081f89e0daf18c2ee9f3dc103d
DIFF: https://github.com/llvm/llvm-project/commit/f471f6ff2f02a5081f89e0daf18c2ee9f3dc103d.diff

LOG: [X86] combineTruncateWithSat - relax minimum truncation size for PACKSS/PACKUS

truncateVectorWithPACK handling of sub-128-bit result types was improved some time ago, so remove the old 64-bit limit

Fixes #68466

Added: 
    

Modified: 
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/fpclamptosat_vec.ll
    llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
    llvm/test/CodeGen/X86/vector-trunc-packus.ll
    llvm/test/CodeGen/X86/vector-trunc-ssat.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 18f6a695e4502e9..9a3e1e9bd3233c5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49604,14 +49604,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
 
-  if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
-      VT.getSizeInBits() >= 64 &&
+  if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
+      isPowerOf2_32(VT.getVectorNumElements()) &&
       (SVT == MVT::i8 || SVT == MVT::i16) &&
       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
     if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
-      // Only do this when the result is at least 64 bits or we'll leaving
-      // dangling PACKSSDW nodes.
       if (SVT == MVT::i8 && InSVT == MVT::i32) {
         EVT MidVT = VT.changeVectorElementType(MVT::i16);
         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,

diff  --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 017fe14366bd679..78ccc983d1637a5 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -1092,38 +1092,14 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) nounwind {
 ; SSE-LABEL: stest_f64i16:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32767,32767,u,u>
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE-NEXT:    pand %xmm1, %xmm2
-; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    packssdw %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: stest_f64i16:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: stest_f64i16:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovdw %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: stest_f64i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 32767, i32 32767>
@@ -1198,24 +1174,11 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) nounwind {
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: ustest_f64i16:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: ustest_f64i16:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovdw %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: ustest_f64i16:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 65535, i32 65535>
@@ -1652,40 +1615,16 @@ define <2 x i8> @stest_f64i8(<2 x double> %x) nounwind {
 ; SSE-LABEL: stest_f64i8:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <127,127,u,u>
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <4294967168,4294967168,u,u>
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pandn %xmm1, %xmm0
-; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    packssdw %xmm0, %xmm0
 ; SSE-NEXT:    packsswb %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: stest_f64i8:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: stest_f64i8:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: stest_f64i8:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 127, i32 127>
@@ -1748,39 +1687,16 @@ define <2 x i8> @ustest_f64i8(<2 x double> %x) nounwind {
 ; SSE-LABEL: ustest_f64i8:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <255,255,u,u>
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    packssdw %xmm0, %xmm0
 ; SSE-NEXT:    packuswb %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: ustest_f64i8:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: ustest_f64i8:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: ustest_f64i8:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 255, i32 255>
@@ -1795,37 +1711,16 @@ define <4 x i8> @stest_f32i8(<4 x float> %x) nounwind {
 ; SSE-LABEL: stest_f32i8:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm0, %xmm2
-; SSE-NEXT:    pandn %xmm1, %xmm0
-; SSE-NEXT:    por %xmm2, %xmm0
 ; SSE-NEXT:    packssdw %xmm0, %xmm0
 ; SSE-NEXT:    packsswb %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: stest_f32i8:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: stest_f32i8:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpmovsdb %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: stest_f32i8:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <4 x float> %x to <4 x i32>
   %0 = icmp slt <4 x i32> %conv, <i32 127, i32 127, i32 127, i32 127>
@@ -1888,37 +1783,16 @@ define <4 x i8> @ustest_f32i8(<4 x float> %x) nounwind {
 ; SSE-LABEL: ustest_f32i8:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    pxor %xmm1, %xmm1
-; SSE-NEXT:    movdqa %xmm2, %xmm0
-; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    packssdw %xmm0, %xmm0
 ; SSE-NEXT:    packuswb %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: ustest_f32i8:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: ustest_f32i8:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovusdb %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: ustest_f32i8:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <4 x float> %x to <4 x i32>
   %0 = icmp slt <4 x i32> %conv, <i32 255, i32 255, i32 255, i32 255>
@@ -3863,38 +3737,14 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) nounwind {
 ; SSE-LABEL: stest_f64i16_mm:
 ; SSE:       # %bb.0: # %entry
 ; SSE-NEXT:    cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32767,32767,u,u>
-; SSE-NEXT:    movdqa %xmm1, %xmm2
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm1, %xmm2
-; SSE-NEXT:    por %xmm0, %xmm2
-; SSE-NEXT:    movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE-NEXT:    pand %xmm1, %xmm2
-; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    por %xmm2, %xmm1
-; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT:    packssdw %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: stest_f64i16_mm:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: stest_f64i16_mm:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovdw %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: stest_f64i16_mm:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> <i32 32767, i32 32767>)
@@ -3966,24 +3816,11 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) nounwind {
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE-NEXT:    retq
 ;
-; AVX2-LABEL: ustest_f64i16_mm:
-; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; AVX2-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: ustest_f64i16_mm:
-; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpmovdw %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: ustest_f64i16_mm:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> <i32 65535, i32 65535>)

diff  --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 38abaf8ff11c6c4..bd1e6d320b69e12 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -5166,25 +5166,13 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-LABEL: truncstore_v4i32_v4i8:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [127,127,127,127]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    pandn %xmm3, %xmm4
-; SSE2-NEXT:    por %xmm0, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm3, %xmm4
-; SSE2-NEXT:    pandn %xmm0, %xmm3
-; SSE2-NEXT:    por %xmm4, %xmm3
-; SSE2-NEXT:    packssdw %xmm3, %xmm3
-; SSE2-NEXT:    packsswb %xmm3, %xmm3
+; SSE2-NEXT:    packssdw %xmm0, %xmm0
+; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
 ; SSE2-NEXT:    xorl $15, %ecx
 ; SSE2-NEXT:    testb $1, %cl
-; SSE2-NEXT:    movd %xmm3, %eax
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB14_1
 ; SSE2-NEXT:  # %bb.2: # %else
 ; SSE2-NEXT:    testb $2, %cl
@@ -5219,8 +5207,6 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-LABEL: truncstore_v4i32_v4i8:
 ; SSE4:       # %bb.0:
 ; SSE4-NEXT:    pxor %xmm2, %xmm2
-; SSE4-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT:    pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-NEXT:    packssdw %xmm0, %xmm0
 ; SSE4-NEXT:    packsswb %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
@@ -5255,92 +5241,49 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    pextrb $3, %xmm0, 3(%rdi)
 ; SSE4-NEXT:    retq
 ;
-; AVX1-LABEL: truncstore_v4i32_v4i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
-; AVX1-NEXT:    testb $1, %al
-; AVX1-NEXT:    jne .LBB14_1
-; AVX1-NEXT:  # %bb.2: # %else
-; AVX1-NEXT:    testb $2, %al
-; AVX1-NEXT:    jne .LBB14_3
-; AVX1-NEXT:  .LBB14_4: # %else2
-; AVX1-NEXT:    testb $4, %al
-; AVX1-NEXT:    jne .LBB14_5
-; AVX1-NEXT:  .LBB14_6: # %else4
-; AVX1-NEXT:    testb $8, %al
-; AVX1-NEXT:    jne .LBB14_7
-; AVX1-NEXT:  .LBB14_8: # %else6
-; AVX1-NEXT:    retq
-; AVX1-NEXT:  .LBB14_1: # %cond.store
-; AVX1-NEXT:    vpextrb $0, %xmm0, (%rdi)
-; AVX1-NEXT:    testb $2, %al
-; AVX1-NEXT:    je .LBB14_4
-; AVX1-NEXT:  .LBB14_3: # %cond.store1
-; AVX1-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
-; AVX1-NEXT:    testb $4, %al
-; AVX1-NEXT:    je .LBB14_6
-; AVX1-NEXT:  .LBB14_5: # %cond.store3
-; AVX1-NEXT:    vpextrb $2, %xmm0, 2(%rdi)
-; AVX1-NEXT:    testb $8, %al
-; AVX1-NEXT:    je .LBB14_8
-; AVX1-NEXT:  .LBB14_7: # %cond.store5
-; AVX1-NEXT:    vpextrb $3, %xmm0, 3(%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: truncstore_v4i32_v4i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [127,127,127,127]
-; AVX2-NEXT:    vpminsd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT:    vpmaxsd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
-; AVX2-NEXT:    testb $1, %al
-; AVX2-NEXT:    jne .LBB14_1
-; AVX2-NEXT:  # %bb.2: # %else
-; AVX2-NEXT:    testb $2, %al
-; AVX2-NEXT:    jne .LBB14_3
-; AVX2-NEXT:  .LBB14_4: # %else2
-; AVX2-NEXT:    testb $4, %al
-; AVX2-NEXT:    jne .LBB14_5
-; AVX2-NEXT:  .LBB14_6: # %else4
-; AVX2-NEXT:    testb $8, %al
-; AVX2-NEXT:    jne .LBB14_7
-; AVX2-NEXT:  .LBB14_8: # %else6
-; AVX2-NEXT:    retq
-; AVX2-NEXT:  .LBB14_1: # %cond.store
-; AVX2-NEXT:    vpextrb $0, %xmm0, (%rdi)
-; AVX2-NEXT:    testb $2, %al
-; AVX2-NEXT:    je .LBB14_4
-; AVX2-NEXT:  .LBB14_3: # %cond.store1
-; AVX2-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
-; AVX2-NEXT:    testb $4, %al
-; AVX2-NEXT:    je .LBB14_6
-; AVX2-NEXT:  .LBB14_5: # %cond.store3
-; AVX2-NEXT:    vpextrb $2, %xmm0, 2(%rdi)
-; AVX2-NEXT:    testb $8, %al
-; AVX2-NEXT:    je .LBB14_8
-; AVX2-NEXT:  .LBB14_7: # %cond.store5
-; AVX2-NEXT:    vpextrb $3, %xmm0, 3(%rdi)
-; AVX2-NEXT:    retq
+; AVX-LABEL: truncstore_v4i32_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX-NEXT:    vmovmskps %xmm1, %eax
+; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    testb $1, %al
+; AVX-NEXT:    jne .LBB14_1
+; AVX-NEXT:  # %bb.2: # %else
+; AVX-NEXT:    testb $2, %al
+; AVX-NEXT:    jne .LBB14_3
+; AVX-NEXT:  .LBB14_4: # %else2
+; AVX-NEXT:    testb $4, %al
+; AVX-NEXT:    jne .LBB14_5
+; AVX-NEXT:  .LBB14_6: # %else4
+; AVX-NEXT:    testb $8, %al
+; AVX-NEXT:    jne .LBB14_7
+; AVX-NEXT:  .LBB14_8: # %else6
+; AVX-NEXT:    retq
+; AVX-NEXT:  .LBB14_1: # %cond.store
+; AVX-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX-NEXT:    testb $2, %al
+; AVX-NEXT:    je .LBB14_4
+; AVX-NEXT:  .LBB14_3: # %cond.store1
+; AVX-NEXT:    vpextrb $1, %xmm0, 1(%rdi)
+; AVX-NEXT:    testb $4, %al
+; AVX-NEXT:    je .LBB14_6
+; AVX-NEXT:  .LBB14_5: # %cond.store3
+; AVX-NEXT:    vpextrb $2, %xmm0, 2(%rdi)
+; AVX-NEXT:    testb $8, %al
+; AVX-NEXT:    je .LBB14_8
+; AVX-NEXT:  .LBB14_7: # %cond.store5
+; AVX-NEXT:    vpextrb $3, %xmm0, 3(%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: truncstore_v4i32_v4i8:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    vpmovsdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    kmovw %k0, %eax
 ; AVX512F-NEXT:    testb $1, %al
 ; AVX512F-NEXT:    jne .LBB14_1
@@ -5376,11 +5319,11 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX512BW-LABEL: truncstore_v4i32_v4i8:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512BW-NEXT:    kshiftlq $60, %k0, %k0
 ; AVX512BW-NEXT:    kshiftrq $60, %k0, %k1
-; AVX512BW-NEXT:    vpmovsdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rdi) {%k1}
 ; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index a94104a002d5ce4..f93f5682df826bf 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -4042,94 +4042,28 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
 }
 
 define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
-; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: trunc_packus_v4i32_v4i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pmaxsd %xmm1, %xmm0
-; SSE41-NEXT:    packusdw %xmm0, %xmm0
-; SSE41-NEXT:    packuswb %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: trunc_packus_v4i32_v4i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-SLOW-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-FAST-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovusdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpmovusdb %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
+; SSE-LABEL: trunc_packus_v4i32_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm0, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
-; AVX512BW-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpmovusdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX-LABEL: trunc_packus_v4i32_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 ;
-; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpmovusdb %xmm0, %xmm0
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc_packus_v4i32_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_packus_v4i32_v4i8:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; SKX-NEXT:    vpmovusdb %xmm0, %xmm0
+; SKX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; SKX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; SKX-NEXT:    retq
   %1 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
   %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
@@ -4140,71 +4074,25 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
 }
 
 define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-SSSE3-NEXT:    packuswb %xmm1, %xmm1
-; SSE2-SSSE3-NEXT:    movd %xmm1, (%rdi)
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: trunc_packus_v4i32_v4i8_store:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pxor %xmm1, %xmm1
-; SSE41-NEXT:    pmaxsd %xmm0, %xmm1
-; SSE41-NEXT:    packusdw %xmm1, %xmm1
-; SSE41-NEXT:    packuswb %xmm1, %xmm1
-; SSE41-NEXT:    movd %xmm1, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-SLOW-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rdi)
-; AVX2-SLOW-NEXT:    retq
+; SSE-LABEL: trunc_packus_v4i32_v4i8_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm0, %xmm0
+; SSE-NEXT:    packuswb %xmm0, %xmm0
+; SSE-NEXT:    movd %xmm0, (%rdi)
+; SSE-NEXT:    retq
 ;
-; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-FAST-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT:    vmovd %xmm0, (%rdi)
-; AVX2-FAST-NEXT:    retq
+; AVX-LABEL: trunc_packus_v4i32_v4i8_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT:    vpmovusdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_packus_v4i32_v4i8_store:
@@ -4216,11 +4104,9 @@ define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
 ;
 ; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpmovusdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rdi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8_store:

diff  --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 2f3fdeb74dc473e..14f724fc3b8c79d 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -3777,86 +3777,28 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
 }
 
 define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
-; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    packssdw %xmm0, %xmm0
-; SSE2-SSSE3-NEXT:    packsswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: trunc_ssat_v4i32_v4i8:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    packssdw %xmm0, %xmm0
-; SSE41-NEXT:    packsswb %xmm0, %xmm0
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-SLOW-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-SLOW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    retq
-;
-; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-FAST-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-FAST-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FAST-NEXT:    retq
-;
-; AVX512F-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpmovsdb %zmm0, %xmm0
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpmovsdb %xmm0, %xmm0
-; AVX512VL-NEXT:    retq
+; SSE-LABEL: trunc_ssat_v4i32_v4i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm0, %xmm0
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    retq
 ;
-; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovsdb %zmm0, %xmm0
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX-LABEL: trunc_ssat_v4i32_v4i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    retq
 ;
-; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpmovsdb %xmm0, %xmm0
-; AVX512BWVL-NEXT:    retq
+; AVX512-LABEL: trunc_ssat_v4i32_v4i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_ssat_v4i32_v4i8:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpmovsdb %xmm0, %xmm0
+; SKX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; SKX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; SKX-NEXT:    retq
   %1 = icmp slt <4 x i32> %a0, <i32 127, i32 127, i32 127, i32 127>
   %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
@@ -3867,70 +3809,25 @@ define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
 }
 
 define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8_store:
-; SSE2-SSSE3:       # %bb.0:
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT:    pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSE2-SSSE3-NEXT:    por %xmm2, %xmm1
-; SSE2-SSSE3-NEXT:    packssdw %xmm1, %xmm1
-; SSE2-SSSE3-NEXT:    packsswb %xmm1, %xmm1
-; SSE2-SSSE3-NEXT:    movd %xmm1, (%rdi)
-; SSE2-SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: trunc_ssat_v4i32_v4i8_store:
-; SSE41:       # %bb.0:
-; SSE41-NEXT:    pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT:    packssdw %xmm0, %xmm0
-; SSE41-NEXT:    packsswb %xmm0, %xmm0
-; SSE41-NEXT:    movd %xmm0, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX1-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, (%rdi)
-; AVX1-NEXT:    retq
-;
-; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX2-SLOW:       # %bb.0:
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-SLOW-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-SLOW-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rdi)
-; AVX2-SLOW-NEXT:    retq
+; SSE-LABEL: trunc_ssat_v4i32_v4i8_store:
+; SSE:       # %bb.0:
+; SSE-NEXT:    packssdw %xmm0, %xmm0
+; SSE-NEXT:    packsswb %xmm0, %xmm0
+; SSE-NEXT:    movd %xmm0, (%rdi)
+; SSE-NEXT:    retq
 ;
-; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX2-FAST:       # %bb.0:
-; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-FAST-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-FAST-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT:    vmovd %xmm0, (%rdi)
-; AVX2-FAST-NEXT:    retq
+; AVX-LABEL: trunc_ssat_v4i32_v4i8_store:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovd %xmm0, (%rdi)
+; AVX-NEXT:    retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v4i32_v4i8_store:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpmovsdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
-; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8_store:
@@ -3940,10 +3837,9 @@ define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
 ;
 ; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8_store:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vpmovsdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rdi)
-; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8_store:


        


More information about the llvm-commits mailing list