[llvm] r334276 - [X86][SSE] Consistently prefer lowering to PACKUS over PACKSS

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 8 03:29:00 PDT 2018


Author: rksimon
Date: Fri Jun  8 03:29:00 2018
New Revision: 334276

URL: http://llvm.org/viewvc/llvm-project?rev=334276&view=rev
Log:
[X86][SSE] Consistently prefer lowering to PACKUS over PACKSS

We have some combines/lowerings that attempt to use PACKSS-then-PACKUS and others that use PACKUS-then-PACKSS.

PACKUS is much easier to combine with if we know the upper bits are zero, as ComputeKnownBits can easily see through BITCASTs etc., especially now that rL333995 and rL334007 have landed. It also effectively works at the byte level, which further simplifies shuffle combines.
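As a rough scalar model of why the known-zero check is the cheaper precondition (hypothetical helper names, not code from this patch), compare which saturation bound makes each i32 -> i16 pack a lossless truncation:

    #include <algorithm>
    #include <cstdint>

    // Sketch of one PACKUSDW vs. PACKSSDW lane (i32 -> i16).
    uint16_t packus_dw(int32_t v) {   // unsigned saturate to [0, 65535]
      return (uint16_t)std::clamp<int32_t>(v, 0, 65535);
    }
    int16_t packss_dw(int32_t v) {    // signed saturate to [-32768, 32767]
      return (int16_t)std::clamp<int32_t>(v, -32768, 32767);
    }
    // packus_dw is exact iff the upper 16 bits are zero - the
    // MaskedValueIsZero test in the X86ISelLowering.cpp hunk below.
    // packss_dw is exact iff the value has more than 16 sign bits - the
    // ComputeNumSignBits test it now falls back to.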

The only (minor) annoyances are that ComputeKnownBits can sometimes take longer, as it doesn't fail as quickly as ComputeNumSignBits (though I'm not seeing any actual regressions in tests), and that PACKUSDW only became available with SSE41, so we have more codegen diffs between targets.
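For instance, the trunc16i32_16i8_lshr test updated below shifts every i32 lane right by 24, leaving 24 known leading zero bits, so on SSE41 both pack steps can now be unsigned. A minimal intrinsics sketch of that shape of codegen (an illustration assuming an SSE4.1 target, not the compiler's output verbatim):

    #include <immintrin.h>

    // Truncate 8 x i32 to 8 x i8 in the low half of the result. After the
    // 24-bit logical right shift every lane fits in 8 bits, so each pack is
    // an exact truncation rather than a saturation.
    __m128i trunc_shifted(__m128i a, __m128i b) {
      a = _mm_srli_epi32(a, 24);           // psrld $24
      b = _mm_srli_epi32(b, 24);           // psrld $24
      __m128i w = _mm_packus_epi32(a, b);  // packusdw (SSE4.1): i32 -> i16
      return _mm_packus_epi16(w, w);       // packuswb: i16 -> i8
    }

Pre-SSE41 there is no packusdw, so only the final i16 -> i8 step can use an unsigned pack, which is where the remaining target-to-target diffs come from.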

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/test/CodeGen/X86/avg.ll
    llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll
    llvm/trunk/test/CodeGen/X86/vector-trunc.ll
    llvm/trunk/test/CodeGen/X86/widen_arith-2.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jun  8 03:29:00 2018
@@ -9404,15 +9404,6 @@ static bool matchVectorShuffleWithPACK(M
   auto MatchPACK = [&](SDValue N1, SDValue N2) {
     SDValue VV1 = DAG.getBitcast(PackVT, N1);
     SDValue VV2 = DAG.getBitcast(PackVT, N2);
-    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
-        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
-      V1 = VV1;
-      V2 = VV2;
-      SrcVT = PackVT;
-      PackOpcode = X86ISD::PACKSS;
-      return true;
-    }
-
     if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
       APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
       if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
@@ -9424,7 +9415,14 @@ static bool matchVectorShuffleWithPACK(M
         return true;
       }
     }
-
+    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
+        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+      V1 = VV1;
+      V2 = VV2;
+      SrcVT = PackVT;
+      PackOpcode = X86ISD::PACKSS;
+      return true;
+    }
     return false;
   };
 
@@ -16897,8 +16895,8 @@ static SDValue truncateVectorWithPACK(un
   if (SrcVT.is128BitVector()) {
     InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
     OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
-    SDValue Res = DAG.getNode(Opcode, DL, OutVT,
-                              DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
+    In = DAG.getBitcast(InVT, In);
+    SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
     Res = extractSubVector(Res, 0, DAG, DL, 64);
     return DAG.getBitcast(DstVT, Res);
   }
@@ -17057,25 +17055,26 @@ SDValue X86TargetLowering::LowerTRUNCATE
     }
   }
 
-  // Truncate with PACKSS if we are truncating a vector with sign-bits that
-  // extend all the way to the packed/truncated value.
-  unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
-  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
-    if (SDValue V =
-            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
-      return V;
+  unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
 
   // Truncate with PACKUS if we are truncating a vector with leading zero bits
   // that extend all the way to the packed/truncated value.
   // Pre-SSE41 we can only use PACKUSWB.
   KnownBits Known;
   DAG.computeKnownBits(In, Known);
-  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
-  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
+  if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
     if (SDValue V =
             truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
       return V;
 
+  // Truncate with PACKSS if we are truncating a vector with sign-bits that
+  // extend all the way to the packed/truncated value.
+  if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+    if (SDValue V =
+            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+      return V;
+
   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
     if (Subtarget.hasInt256()) {
@@ -35282,9 +35281,6 @@ static SDValue combineTruncateWithSat(SD
   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
       (SVT == MVT::i8 || SVT == MVT::i16) &&
       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
-    if (auto SSatVal = detectSSatPattern(In, VT))
-      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
-                                    Subtarget);
     if (auto USatVal = detectSSatPattern(In, VT, true)) {
       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
       if (SVT == MVT::i8 && InSVT == MVT::i32) {
@@ -35299,6 +35295,9 @@ static SDValue combineTruncateWithSat(SD
         return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
                                       Subtarget);
     }
+    if (auto SSatVal = detectSSatPattern(In, VT))
+      return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
+                                    Subtarget);
   }
   return SDValue();
 }
@@ -36530,22 +36529,23 @@ static SDValue combineVectorSignBitsTrun
   if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
     return SDValue();
 
-  // Use PACKSS if the input has sign-bits that extend all the way to the
-  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
-  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
-  unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
-  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
-    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+  unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+  unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
 
   // Use PACKUS if the input has zero-bits that extend all the way to the
   // packed/truncated value. e.g. masks, zext_in_reg, etc.
   KnownBits Known;
   DAG.computeKnownBits(In, Known);
   unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
-  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
-  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+  if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
     return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
 
+  // Use PACKSS if the input has sign-bits that extend all the way to the
+  // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+  if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
+    return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
   return SDValue();
 }
 

Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Fri Jun  8 03:29:00 2018
@@ -347,22 +347,22 @@ define void @avg_v48i8(<48 x i8>* %a, <4
 ; AVX1-NEXT:    vpsubd %xmm7, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpsrld $1, %xmm6, %xmm2
 ; AVX1-NEXT:    vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrld $1, %xmm5, %xmm3
 ; AVX1-NEXT:    vpsrld $1, %xmm12, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrld $1, %xmm11, %xmm4
 ; AVX1-NEXT:    vpsrld $1, %xmm8, %xmm5
-; AVX1-NEXT:    vpackssdw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpsrld $1, %xmm9, %xmm5
 ; AVX1-NEXT:    vpsrld $1, %xmm10, %xmm6
-; AVX1-NEXT:    vpackssdw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT:    vpackusdw %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX1-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
@@ -434,26 +434,26 @@ define void @avg_v48i8(<48 x i8>* %a, <4
 ; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $1, %ymm4, %ymm4
 ; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT:    vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT:    vpackusdw %xmm6, %xmm4, %xmm4
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
-; AVX2-NEXT:    vpackssdw %xmm7, %xmm1, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm7, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT:    vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm3, %xmm3
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm5, %xmm2
+; AVX2-NEXT:    vpackusdw %xmm2, %xmm5, %xmm2
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rax)

Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll Fri Jun  8 03:29:00 2018
@@ -180,7 +180,7 @@ define <16 x i8> @testv16i1_zext_v16i8(<
 ; AVX256-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; AVX256-NEXT:    vpmovdw %ymm0, %xmm0
 ; AVX256-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX256-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX256-NEXT:    vzeroupper
 ; AVX256-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll Fri Jun  8 03:29:00 2018
@@ -1415,7 +1415,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE2-NEXT:    por %xmm7, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    packssdw %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NEXT:    movdqa %xmm2, %xmm1
 ; SSE2-NEXT:    pxor %xmm10, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
@@ -1438,8 +1438,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSE2-NEXT:    por %xmm1, %xmm2
 ; SSE2-NEXT:    pand %xmm5, %xmm2
-; SSE2-NEXT:    packssdw %xmm3, %xmm2
-; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm3, %xmm2
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
@@ -1521,7 +1521,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm7, %xmm0
 ; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    packssdw %xmm1, %xmm0
+; SSSE3-NEXT:    packuswb %xmm1, %xmm0
 ; SSSE3-NEXT:    movdqa %xmm2, %xmm1
 ; SSSE3-NEXT:    pxor %xmm10, %xmm1
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm3
@@ -1544,8 +1544,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
 ; SSSE3-NEXT:    por %xmm1, %xmm2
 ; SSSE3-NEXT:    pand %xmm5, %xmm2
-; SSSE3-NEXT:    packssdw %xmm3, %xmm2
-; SSSE3-NEXT:    packssdw %xmm2, %xmm0
+; SSSE3-NEXT:    packuswb %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_packus_v8i64_v8i8:
@@ -1626,7 +1626,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT:    packssdw %xmm5, %xmm1
+; SSE41-NEXT:    packusdw %xmm5, %xmm1
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    xorpd %xmm10, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm4
@@ -1650,8 +1650,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT:    packssdw %xmm4, %xmm3
-; SSE41-NEXT:    packssdw %xmm3, %xmm1
+; SSE41-NEXT:    packusdw %xmm4, %xmm3
+; SSE41-NEXT:    packusdw %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1678,11 +1678,11 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm7, %xmm2
 ; AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand %xmm1, %xmm6, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm2
 ; AVX1-NEXT:    vpand %xmm0, %xmm3, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -1698,10 +1698,10 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
 ; AVX2-NEXT:    vpand %ymm1, %ymm3, %ymm1
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
@@ -2109,11 +2109,11 @@ define void @trunc_packus_v8i64_v8i8_sto
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm1, %ymm2
 ; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -2676,7 +2676,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
 ; SSE41-NEXT:    por %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT:    packssdw %xmm4, %xmm1
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
 ; SSE41-NEXT:    movapd %xmm6, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
@@ -2701,7 +2701,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm5, %xmm3
-; SSE41-NEXT:    packssdw %xmm2, %xmm3
+; SSE41-NEXT:    packusdw %xmm2, %xmm3
 ; SSE41-NEXT:    packusdw %xmm3, %xmm1
 ; SSE41-NEXT:    movapd %xmm15, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
@@ -2727,7 +2727,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
 ; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm14, %xmm3
-; SSE41-NEXT:    packssdw %xmm2, %xmm3
+; SSE41-NEXT:    packusdw %xmm2, %xmm3
 ; SSE41-NEXT:    movapd %xmm13, %xmm0
 ; SSE41-NEXT:    xorpd %xmm9, %xmm0
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
@@ -2751,7 +2751,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm10, %xmm8
-; SSE41-NEXT:    packssdw %xmm2, %xmm8
+; SSE41-NEXT:    packusdw %xmm2, %xmm8
 ; SSE41-NEXT:    packusdw %xmm8, %xmm3
 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -2796,17 +2796,17 @@ define <16 x i8> @trunc_packus_v16i64_v1
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm4
 ; AVX1-NEXT:    vpand %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpand %xmm15, %xmm6, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpand %xmm5, %xmm13, %xmm4
 ; AVX1-NEXT:    vpand %xmm2, %xmm12, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand %xmm7, %xmm11, %xmm3
 ; AVX1-NEXT:    vpand %xmm1, %xmm10, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm14, %xmm9, %xmm3
 ; AVX1-NEXT:    vpand %xmm0, %xmm8, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
@@ -2886,7 +2886,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
 ; SSE2-NEXT:    movdqa %xmm3, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSE2-NEXT:    pand %xmm3, %xmm0
-; SSE2-NEXT:    packssdw %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: trunc_packus_v8i32_v8i8:
@@ -2909,7 +2909,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm0
 ; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
 ; SSSE3-NEXT:    pand %xmm3, %xmm0
-; SSSE3-NEXT:    packssdw %xmm2, %xmm0
+; SSSE3-NEXT:    packuswb %xmm2, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: trunc_packus_v8i32_v8i8:
@@ -2920,7 +2920,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
 ; SSE41-NEXT:    pmaxsd %xmm2, %xmm1
 ; SSE41-NEXT:    pmaxsd %xmm2, %xmm0
-; SSE41-NEXT:    packssdw %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: trunc_packus_v8i32_v8i8:
@@ -2932,7 +2932,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
@@ -2943,7 +2943,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll Fri Jun  8 03:29:00 2018
@@ -1114,7 +1114,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
 ; SSE41-NEXT:    por %xmm7, %xmm0
 ; SSE41-NEXT:    movapd %xmm5, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT:    packssdw %xmm4, %xmm1
+; SSE41-NEXT:    packusdw %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm6, %xmm0
 ; SSE41-NEXT:    movdqa %xmm9, %xmm4
@@ -1137,7 +1137,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT:    packssdw %xmm4, %xmm5
+; SSE41-NEXT:    packusdw %xmm4, %xmm5
 ; SSE41-NEXT:    packusdw %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -1162,9 +1162,9 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
@@ -1443,11 +1443,11 @@ define void @trunc_usat_v8i64_v8i8_store
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -1732,7 +1732,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
 ; SSE41-NEXT:    por %xmm14, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm13
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm13
-; SSE41-NEXT:    packssdw %xmm12, %xmm13
+; SSE41-NEXT:    packusdw %xmm12, %xmm13
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    movdqa %xmm11, %xmm1
@@ -1757,7 +1757,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
 ; SSE41-NEXT:    por %xmm1, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm1
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT:    packssdw %xmm12, %xmm1
+; SSE41-NEXT:    packusdw %xmm12, %xmm1
 ; SSE41-NEXT:    packusdw %xmm1, %xmm13
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
@@ -1783,7 +1783,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
 ; SSE41-NEXT:    por %xmm5, %xmm0
 ; SSE41-NEXT:    movapd %xmm9, %xmm2
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT:    packssdw %xmm1, %xmm2
+; SSE41-NEXT:    packusdw %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm7, %xmm0
 ; SSE41-NEXT:    pxor %xmm10, %xmm0
 ; SSE41-NEXT:    movdqa %xmm11, %xmm1
@@ -1806,7 +1806,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
 ; SSE41-NEXT:    blendvpd %xmm0, %xmm6, %xmm9
-; SSE41-NEXT:    packssdw %xmm1, %xmm9
+; SSE41-NEXT:    packusdw %xmm1, %xmm9
 ; SSE41-NEXT:    packusdw %xmm9, %xmm2
 ; SSE41-NEXT:    packuswb %xmm2, %xmm13
 ; SSE41-NEXT:    movdqa %xmm13, %xmm0
@@ -1846,14 +1846,14 @@ define <16 x i8> @trunc_usat_v16i64_v16i
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm4
 ; AVX1-NEXT:    vblendvpd %ymm4, %ymm3, %ymm8, %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper

Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Fri Jun  8 03:29:00 2018
@@ -974,10 +974,10 @@ define void @trunc16i32_16i8_lshr(<16 x
 ; SSE41:       # %bb.0: # %entry
 ; SSE41-NEXT:    psrld $24, %xmm1
 ; SSE41-NEXT:    psrld $24, %xmm0
-; SSE41-NEXT:    packssdw %xmm1, %xmm0
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
 ; SSE41-NEXT:    psrld $24, %xmm3
 ; SSE41-NEXT:    psrld $24, %xmm2
-; SSE41-NEXT:    packssdw %xmm3, %xmm2
+; SSE41-NEXT:    packusdw %xmm3, %xmm2
 ; SSE41-NEXT:    packuswb %xmm2, %xmm0
 ; SSE41-NEXT:    movdqu %xmm0, (%rax)
 ; SSE41-NEXT:    retq
@@ -987,11 +987,11 @@ define void @trunc16i32_16i8_lshr(<16 x
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpsrld $24, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
 ; AVX1-NEXT:    vzeroupper

Modified: llvm/trunk/test/CodeGen/X86/widen_arith-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_arith-2.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_arith-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_arith-2.ll Fri Jun  8 03:29:00 2018
@@ -24,7 +24,7 @@ define void @update(i64* %dst_i, i64* %s
 ; CHECK-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; CHECK-NEXT:    psubw %xmm0, %xmm2
 ; CHECK-NEXT:    pand %xmm1, %xmm2
-; CHECK-NEXT:    packsswb %xmm0, %xmm2
+; CHECK-NEXT:    packuswb %xmm0, %xmm2
 ; CHECK-NEXT:    movq %xmm2, (%edx,%eax,8)
 ; CHECK-NEXT:    incl (%esp)
 ; CHECK-NEXT:  .LBB0_1: # %forcond



