[llvm] r334276 - [X86][SSE] Consistently prefer lowering to PACKUS over PACKSS
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 8 03:29:00 PDT 2018
Author: rksimon
Date: Fri Jun 8 03:29:00 2018
New Revision: 334276
URL: http://llvm.org/viewvc/llvm-project?rev=334276&view=rev
Log:
[X86][SSE] Consistently prefer lowering to PACKUS over PACKSS
We have some combines/lowerings that attempt to use PACKSS-then-PACKUS and others that use PACKUS-then-PACKSS.
PACKUS is much easier to combine with if we know the upper bits are zero, as ComputeKnownBits can easily see through BITCASTs etc., especially now that rL333995 and rL334007 have landed. It also effectively works at the byte level, which further simplifies shuffle combines.
The only (minor) annoyances are that ComputeKnownBits can sometimes take longer, as it doesn't fail as quickly as ComputeNumSignBits (but I'm not seeing any actual regressions in tests), and that PACKUSDW only became available with SSE41, so we have more codegen diffs between targets.
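For illustration only (this example is not part of the commit, and the exact codegen depends on target features): a v8i32 -> v8i16 truncation whose inputs are masked down to 8 bits has at least 16 known leading zero bits per element, so the reordered lowering can now pick PACKUSDW on SSE41+ targets instead of PACKSSDW, while pre-SSE41 targets still need the stricter 8-bit zero guarantee before PACKUSWB can be used.

; Hypothetical example, not taken from the modified tests.
define <8 x i16> @trunc_known_zero(<8 x i32> %a) {
  ; Masking leaves 24 leading zero bits per i32 element, so the
  ; truncated value is known to fit the unsigned 16-bit range.
  %m = and <8 x i32> %a, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %t = trunc <8 x i32> %m to <8 x i16>
  ret <8 x i16> %t
}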
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avg.ll
llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll
llvm/trunk/test/CodeGen/X86/vector-trunc.ll
llvm/trunk/test/CodeGen/X86/widen_arith-2.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Jun 8 03:29:00 2018
@@ -9404,15 +9404,6 @@ static bool matchVectorShuffleWithPACK(M
auto MatchPACK = [&](SDValue N1, SDValue N2) {
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
- V1 = VV1;
- V2 = VV2;
- SrcVT = PackVT;
- PackOpcode = X86ISD::PACKSS;
- return true;
- }
-
if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
@@ -9424,7 +9415,14 @@ static bool matchVectorShuffleWithPACK(M
return true;
}
}
-
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKSS;
+ return true;
+ }
return false;
};
@@ -16897,8 +16895,8 @@ static SDValue truncateVectorWithPACK(un
if (SrcVT.is128BitVector()) {
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
- SDValue Res = DAG.getNode(Opcode, DL, OutVT,
- DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
+ In = DAG.getBitcast(InVT, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
@@ -17057,25 +17055,26 @@ SDValue X86TargetLowering::LowerTRUNCATE
}
}
- // Truncate with PACKSS if we are truncating a vector with sign-bits that
- // extend all the way to the packed/truncated value.
- unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
- if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
- if (SDValue V =
- truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
- return V;
+ unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Truncate with PACKUS if we are truncating a vector with leading zero bits
// that extend all the way to the packed/truncated value.
// Pre-SSE41 we can only use PACKUSWB.
KnownBits Known;
DAG.computeKnownBits(In, Known);
- NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
- if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
+ if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
if (SDValue V =
truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
+ // Truncate with PACKSS if we are truncating a vector with sign-bits that
+ // extend all the way to the packed/truncated value.
+ if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+ return V;
+
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
// On AVX2, v4i64 -> v4i32 becomes VPERMD.
if (Subtarget.hasInt256()) {
@@ -35282,9 +35281,6 @@ static SDValue combineTruncateWithSat(SD
if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
(SVT == MVT::i8 || SVT == MVT::i16) &&
(InSVT == MVT::i16 || InSVT == MVT::i32)) {
- if (auto SSatVal = detectSSatPattern(In, VT))
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
- Subtarget);
if (auto USatVal = detectSSatPattern(In, VT, true)) {
// vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
if (SVT == MVT::i8 && InSVT == MVT::i32) {
@@ -35299,6 +35295,9 @@ static SDValue combineTruncateWithSat(SD
return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
Subtarget);
}
+ if (auto SSatVal = detectSSatPattern(In, VT))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
+ Subtarget);
}
return SDValue();
}
@@ -36530,22 +36529,23 @@ static SDValue combineVectorSignBitsTrun
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
- // Use PACKSS if the input has sign-bits that extend all the way to the
- // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
- unsigned NumSignBits = DAG.ComputeNumSignBits(In);
- unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
- if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
- return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+ unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+ unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
// Use PACKUS if the input has zero-bits that extend all the way to the
// packed/truncated value. e.g. masks, zext_in_reg, etc.
KnownBits Known;
DAG.computeKnownBits(In, Known);
unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
- NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
- if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
return SDValue();
}
Modified: llvm/trunk/test/CodeGen/X86/avg.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avg.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avg.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avg.ll Fri Jun 8 03:29:00 2018
@@ -347,22 +347,22 @@ define void @avg_v48i8(<48 x i8>* %a, <4
; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsrld $1, %xmm6, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4
-; AVX1-NEXT: vpackssdw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpackusdw %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $1, %xmm5, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm12, %xmm4
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpsrld $1, %xmm11, %xmm4
; AVX1-NEXT: vpsrld $1, %xmm8, %xmm5
-; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vpackusdw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpsrld $1, %xmm9, %xmm5
; AVX1-NEXT: vpsrld $1, %xmm10, %xmm6
-; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4
@@ -434,26 +434,26 @@ define void @avg_v48i8(<48 x i8>* %a, <4
; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpackusdw %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX2-NEXT: vpackssdw %xmm7, %xmm1, %xmm1
+; AVX2-NEXT: vpackusdw %xmm7, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm5, %xmm2
+; AVX2-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll Fri Jun 8 03:29:00 2018
@@ -180,7 +180,7 @@ define <16 x i8> @testv16i1_zext_v16i8(<
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
; AVX256-NEXT: vpsrlw $15, %xmm0, %xmm0
-; AVX256-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX256-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX256-NEXT: vzeroupper
; AVX256-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-packus.ll Fri Jun 8 03:29:00 2018
@@ -1415,7 +1415,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE2-NEXT: por %xmm7, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pxor %xmm10, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm3
@@ -1438,8 +1438,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
@@ -1521,7 +1521,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm7, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
+; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pxor %xmm10, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm3
@@ -1544,8 +1544,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm3, %xmm2
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i64_v8i8:
@@ -1626,7 +1626,7 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: packssdw %xmm5, %xmm1
+; SSE41-NEXT: packusdw %xmm5, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: xorpd %xmm10, %xmm0
; SSE41-NEXT: movapd %xmm0, %xmm4
@@ -1650,8 +1650,8 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packssdw %xmm4, %xmm3
-; SSE41-NEXT: packssdw %xmm3, %xmm1
+; SSE41-NEXT: packusdw %xmm4, %xmm3
+; SSE41-NEXT: packusdw %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
@@ -1678,11 +1678,11 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm2
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1698,10 +1698,10 @@ define <8 x i8> @trunc_packus_v8i64_v8i8
; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -2109,11 +2109,11 @@ define void @trunc_packus_v8i64_v8i8_sto
; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -2676,7 +2676,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
; SSE41-NEXT: por %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: packssdw %xmm4, %xmm1
+; SSE41-NEXT: packusdw %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm6, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm0, %xmm2
@@ -2701,7 +2701,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: packssdw %xmm2, %xmm3
+; SSE41-NEXT: packusdw %xmm2, %xmm3
; SSE41-NEXT: packusdw %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm15, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
@@ -2727,7 +2727,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3
-; SSE41-NEXT: packssdw %xmm2, %xmm3
+; SSE41-NEXT: packusdw %xmm2, %xmm3
; SSE41-NEXT: movapd %xmm13, %xmm0
; SSE41-NEXT: xorpd %xmm9, %xmm0
; SSE41-NEXT: movapd %xmm0, %xmm2
@@ -2751,7 +2751,7 @@ define <16 x i8> @trunc_packus_v16i64_v1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm8
-; SSE41-NEXT: packssdw %xmm2, %xmm8
+; SSE41-NEXT: packusdw %xmm2, %xmm8
; SSE41-NEXT: packusdw %xmm8, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
@@ -2796,17 +2796,17 @@ define <16 x i8> @trunc_packus_v16i64_v1
; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm4
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpand %xmm15, %xmm6, %xmm4
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpackusdw %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpand %xmm5, %xmm13, %xmm4
; AVX1-NEXT: vpand %xmm2, %xmm12, %xmm2
-; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm11, %xmm3
; AVX1-NEXT: vpand %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm14, %xmm9, %xmm3
; AVX1-NEXT: vpand %xmm0, %xmm8, %xmm0
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
@@ -2886,7 +2886,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i32_v8i8:
@@ -2909,7 +2909,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm2, %xmm0
+; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i32_v8i8:
@@ -2920,7 +2920,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pmaxsd %xmm2, %xmm1
; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8:
@@ -2932,7 +2932,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -2943,7 +2943,7 @@ define <8 x i8> @trunc_packus_v8i32_v8i8
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc-usat.ll Fri Jun 8 03:29:00 2018
@@ -1114,7 +1114,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: movapd %xmm5, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packssdw %xmm4, %xmm1
+; SSE41-NEXT: packusdw %xmm4, %xmm1
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm6, %xmm0
; SSE41-NEXT: movdqa %xmm9, %xmm4
@@ -1137,7 +1137,7 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT: packssdw %xmm4, %xmm5
+; SSE41-NEXT: packusdw %xmm4, %xmm5
; SSE41-NEXT: packusdw %xmm5, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
@@ -1162,9 +1162,9 @@ define <8 x i8> @trunc_usat_v8i64_v8i8(<
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1443,11 +1443,11 @@ define void @trunc_usat_v8i64_v8i8_store
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
@@ -1732,7 +1732,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
; SSE41-NEXT: por %xmm14, %xmm0
; SSE41-NEXT: movapd %xmm9, %xmm13
; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm13
-; SSE41-NEXT: packssdw %xmm12, %xmm13
+; SSE41-NEXT: packusdw %xmm12, %xmm13
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm11, %xmm1
@@ -1757,7 +1757,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: movapd %xmm9, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packssdw %xmm12, %xmm1
+; SSE41-NEXT: packusdw %xmm12, %xmm1
; SSE41-NEXT: packusdw %xmm1, %xmm13
; SSE41-NEXT: movdqa %xmm5, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
@@ -1783,7 +1783,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
; SSE41-NEXT: por %xmm5, %xmm0
; SSE41-NEXT: movapd %xmm9, %xmm2
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: packssdw %xmm1, %xmm2
+; SSE41-NEXT: packusdw %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm7, %xmm0
; SSE41-NEXT: pxor %xmm10, %xmm0
; SSE41-NEXT: movdqa %xmm11, %xmm1
@@ -1806,7 +1806,7 @@ define <16 x i8> @trunc_usat_v16i64_v16i
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9
-; SSE41-NEXT: packssdw %xmm1, %xmm9
+; SSE41-NEXT: packusdw %xmm1, %xmm9
; SSE41-NEXT: packusdw %xmm9, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm13
; SSE41-NEXT: movdqa %xmm13, %xmm0
@@ -1846,14 +1846,14 @@ define <16 x i8> @trunc_usat_v16i64_v16i
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-NEXT: vblendvpd %ymm4, %ymm3, %ymm8, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/vector-trunc.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-trunc.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-trunc.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-trunc.ll Fri Jun 8 03:29:00 2018
@@ -974,10 +974,10 @@ define void @trunc16i32_16i8_lshr(<16 x
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $24, %xmm1
; SSE41-NEXT: psrld $24, %xmm0
-; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $24, %xmm3
; SSE41-NEXT: psrld $24, %xmm2
-; SSE41-NEXT: packssdw %xmm3, %xmm2
+; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
@@ -987,11 +987,11 @@ define void @trunc16i32_16i8_lshr(<16 x
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/widen_arith-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/widen_arith-2.ll?rev=334276&r1=334275&r2=334276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/widen_arith-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/widen_arith-2.ll Fri Jun 8 03:29:00 2018
@@ -24,7 +24,7 @@ define void @update(i64* %dst_i, i64* %s
; CHECK-NEXT: pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; CHECK-NEXT: psubw %xmm0, %xmm2
; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: packsswb %xmm0, %xmm2
+; CHECK-NEXT: packuswb %xmm0, %xmm2
; CHECK-NEXT: movq %xmm2, (%edx,%eax,8)
; CHECK-NEXT: incl (%esp)
; CHECK-NEXT: .LBB0_1: # %forcond