[llvm] e229086 - [X86] ReplaceNodeResults - widen sub-128-bit vector truncations if it would allow them to use PACKSS/PACKUS
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 4 09:36:30 PDT 2023
Author: Simon Pilgrim
Date: 2023-08-04T17:36:19+01:00
New Revision: e22908692c9b50ed5a96fec9d402f3fb099f7ffd
URL: https://github.com/llvm/llvm-project/commit/e22908692c9b50ed5a96fec9d402f3fb099f7ffd
DIFF: https://github.com/llvm/llvm-project/commit/e22908692c9b50ed5a96fec9d402f3fb099f7ffd.diff
LOG: [X86] ReplaceNodeResults - widen sub-128-bit vector truncations if it would allow them to use PACKSS/PACKUS
We currently just scalarize sub-128-bit vector truncations, but if the input vector has sufficient sign bits/zero bits then we should try to use PACKSS/PACKUS on a widened vector whose upper elements are don't-cares. Shuffle lowering will struggle to detect this if we wait until the scalarization has been re-vectorized as a shuffle.
Another step towards issue #63710
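
As a rough illustration of the eligibility rule this patch adds (a standalone sketch with made-up names and sample values, not the actual SelectionDAG code): PACKUS is usable when the known leading zero bits reach all the way down to the packed value, PACKSS when the replicated sign bits do, mirroring the computeKnownBits()/ComputeNumSignBits() checks in the diff below.

#include <algorithm>
#include <cstdio>

// Illustrative model of the new eligibility rule (names are invented; the
// real code queries DAG.computeKnownBits()/DAG.ComputeNumSignBits()).
const char *choosePack(unsigned SrcEltBits, unsigned DstEltBits,
                       unsigned LeadingZeroBits, unsigned SignBits,
                       bool HasSSE41) {
  // Each PACK stage narrows by at most 16 bits; pre-SSE4.1 there is no
  // PACKUSDW, so the zero-bit path can only count on PACKUSWB (8 bits).
  unsigned NumPackedSignBits = std::min(DstEltBits, 16u);
  unsigned NumPackedZeroBits = HasSSE41 ? NumPackedSignBits : 8;

  // PACKUS: every bit above the packed value must be known zero
  // (masks, zext_in_reg, etc.).
  if (LeadingZeroBits >= SrcEltBits - NumPackedZeroBits)
    return "PACKUS";
  // PACKSS: the sign bit must be replicated down into the packed value
  // (compare results, sext_in_reg, etc.).
  if (SignBits > SrcEltBits - NumPackedSignBits)
    return "PACKSS";
  return "scalarize";
}

int main() {
  // A v4i32 compare result (all-ones/all-zeros lanes): 32 sign bits -> PACKSS.
  std::printf("%s\n", choosePack(32, 8, 0, 32, false));
  // A v4i32 value zero-extended from i8: 24 leading zeros -> PACKUS.
  std::printf("%s\n", choosePack(32, 8, 24, 1, true));
}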
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/test/CodeGen/X86/buildvec-insertvec.ll
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/pmulh.ll
llvm/test/CodeGen/X86/vector-trunc-packus.ll
llvm/test/CodeGen/X86/vector-trunc-ssat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 960688087cb996..c999f389bc5e23 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3716,6 +3716,12 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
return DAG.getBuildVector(ResultVT, dl,
Vec->ops().slice(IdxVal, ElemsPerChunk));
+ // Check if we're extracting the upper undef of a widening pattern.
+ if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
+ Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
+ isNullConstant(Vec.getOperand(2)))
+ return DAG.getUNDEF(ResultVT);
+
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
}
@@ -20016,6 +20022,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
SDValue Lo, Hi;
std::tie(Lo, Hi) = splitVector(In, DAG, DL);
+ // If Hi is undef, then don't bother packing it and widen the result instead.
+ if (Hi.isUndef()) {
+ EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
+ if (SDValue Res =
+ truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
+ return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
+ }
+
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
@@ -31974,9 +31988,45 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
EVT InEltVT = InVT.getVectorElementType();
EVT EltVT = VT.getVectorElementType();
unsigned WidenNumElts = WidenVT.getVectorNumElements();
-
unsigned InBits = InVT.getSizeInBits();
+
if (128 % InBits == 0) {
+ // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
+ // Skip for AVX512 unless this will be a single stage truncation.
+ if ((InEltVT == MVT::i16 || InEltVT == MVT::i32) &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16) &&
+ (!Subtarget.hasAVX512() || InBits == (2 * VT.getSizeInBits()))) {
+ unsigned NumPackedSignBits =
+ std::min<unsigned>(EltVT.getSizeInBits(), 16);
+ unsigned NumPackedZeroBits =
+ Subtarget.hasSSE41() ? NumPackedSignBits : 8;
+
+ // Use PACKUS if the input has zero-bits that extend all the way to the
+ // packed/truncated value. e.g. masks, zext_in_reg, etc.
+ KnownBits Known = DAG.computeKnownBits(In);
+ unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+ bool UsePACKUS =
+ NumLeadingZeroBits >= (InEltVT.getSizeInBits() - NumPackedZeroBits);
+
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ bool UsePACKSS =
+ NumSignBits > (InEltVT.getSizeInBits() - NumPackedSignBits);
+
+ if (UsePACKUS || UsePACKSS) {
+ SDValue WidenIn =
+ widenSubVector(In, false, Subtarget, DAG, dl,
+ InEltVT.getSizeInBits() * WidenNumElts);
+ if (SDValue Res = truncateVectorWithPACK(
+ UsePACKUS ? X86ISD::PACKUS : X86ISD::PACKSS, WidenVT, WidenIn,
+ dl, DAG, Subtarget)) {
+ Results.push_back(Res);
+ return;
+ }
+ }
+ }
+
// 128 bit and smaller inputs should avoid truncate altogether and
// just use a build_vector that will become a shuffle.
// TODO: Widen and use a shuffle directly?
@@ -31992,6 +32042,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
return;
}
+
// With AVX512 there are some cases that can use a target specific
// truncate node to go from 256/512 to less than 128 with zeros in the
// upper elements of the 128 bit result.
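
To see why the pack sequence is a valid truncation here, consider a minimal emulation (my own sketch, not part of the patch) of the two PACKSS stages that now implement the clamped v4i32 -> v4i8 truncations in the tests below: PACKSS saturates to exactly the signed range of the destination, so once the input has already been clamped into that range the saturation never fires and the packs degenerate to a plain truncate.

#include <cassert>
#include <cstdint>

// Saturating narrows, modelling one lane of PACKSSDW and PACKSSWB.
static int16_t satI16(int32_t V) {
  return (int16_t)(V > 32767 ? 32767 : V < -32768 ? -32768 : V);
}
static int8_t satI8(int16_t V) {
  return (int8_t)(V > 127 ? 127 : V < -128 ? -128 : V);
}

int main() {
  // Input already signed-saturated to [-128, 127], as in trunc_ssat_v4i32_v4i8.
  int32_t In[4] = {127, -128, 5, -1};

  int16_t Stage1[4]; // packssdw xmm0, xmm0 (low half of the result)
  for (int I = 0; I < 4; ++I)
    Stage1[I] = satI16(In[I]);

  int8_t Stage2[4]; // packsswb xmm0, xmm0 (low half of the result)
  for (int I = 0; I < 4; ++I)
    Stage2[I] = satI8(Stage1[I]);

  // Saturation never fires, so this equals a plain bit truncation.
  for (int I = 0; I < 4; ++I)
    assert(Stage2[I] == (int8_t)In[I]);
}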
diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
index a41ed3d5e0b482..e4c62fca5bd57a 100644
--- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -1458,9 +1458,7 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; KNL-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1]
; KNL-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
; KNL-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc4,0xc2,0x01]
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; KNL-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
-; KNL-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; KNL-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
; KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
; KNL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; KNL-NEXT: vpextrw $0, %xmm0, (%rsi) ## encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
@@ -1492,9 +1490,7 @@ define void @half_vec_compare(ptr %x, ptr %y) {
; AVX512BW-NEXT: cmovpl %ecx, %eax ## encoding: [0x0f,0x4a,0xc1]
; AVX512BW-NEXT: vmovd %eax, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc0]
; AVX512BW-NEXT: vpinsrw $1, %edx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc2,0x01]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A]
-; AVX512BW-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
+; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc0]
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdb,0x05,A,A,A,A]
; AVX512BW-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x15,0x06,0x00]
diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
index 2e846f900efcd7..5500ad33230431 100644
--- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll
@@ -8,14 +8,9 @@ define void @foo(<3 x float> %in, ptr nocapture %out) nounwind {
; SSE2-LABEL: foo:
; SSE2: # %bb.0:
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT: shll $8, %ecx
-; SSE2-NEXT: orl %eax, %ecx
-; SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT: shll $16, %eax
-; SSE2-NEXT: orl %ecx, %eax
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: orl $-16777216, %eax # imm = 0xFF000000
; SSE2-NEXT: movl %eax, (%rdi)
; SSE2-NEXT: retq
@@ -23,7 +18,8 @@ define void @foo(<3 x float> %in, ptr nocapture %out) nounwind {
; SSE41-LABEL: foo:
; SSE41: # %bb.0:
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: packusdw %xmm0, %xmm0
+; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: movl $255, %eax
; SSE41-NEXT: pinsrb $3, %eax, %xmm0
; SSE41-NEXT: movd %xmm0, (%rdi)
@@ -32,7 +28,8 @@ define void @foo(<3 x float> %in, ptr nocapture %out) nounwind {
; AVX-LABEL: foo:
; AVX: # %bb.0:
; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: movl $255, %eax
; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
; AVX-NEXT: vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 845f128ea9412f..fb9c320ee2757f 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -701,13 +701,13 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) {
; CHECK-NEXT: pand %xmm2, %xmm0
; CHECK-NEXT: pandn %xmm1, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
-; CHECK-NEXT: movdqa %xmm2, %xmm1
-; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
-; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: pandn %xmm0, %xmm1
-; CHECK-NEXT: por %xmm2, %xmm1
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4294934528,4294934528,u,u>
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pand %xmm0, %xmm2
+; CHECK-NEXT: pandn %xmm1, %xmm0
+; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: packssdw %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
@@ -2265,13 +2265,13 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
; CHECK-NEXT: pand %xmm2, %xmm0
; CHECK-NEXT: pandn %xmm1, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
-; CHECK-NEXT: movdqa %xmm2, %xmm1
-; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
-; CHECK-NEXT: pand %xmm1, %xmm2
-; CHECK-NEXT: pandn %xmm0, %xmm1
-; CHECK-NEXT: por %xmm2, %xmm1
-; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4294934528,4294934528,u,u>
+; CHECK-NEXT: movdqa %xmm2, %xmm0
+; CHECK-NEXT: pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT: pand %xmm0, %xmm2
+; CHECK-NEXT: pandn %xmm1, %xmm0
+; CHECK-NEXT: por %xmm2, %xmm0
+; CHECK-NEXT: packssdw %xmm0, %xmm0
; CHECK-NEXT: retq
entry:
%conv = fptosi <2 x double> %x to <2 x i32>
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index c50ff98d4de06e..7284f316d1d406 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -5185,9 +5185,8 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm3
+; SSE2-NEXT: packssdw %xmm3, %xmm3
+; SSE2-NEXT: packsswb %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: movmskps %xmm2, %ecx
; SSE2-NEXT: xorl $15, %ecx
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 70e36cbb099dea..6d4c593d6eb7fe 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -4956,11 +4956,9 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll
index 503a294f2f9300..bd01c2f22ef07f 100644
--- a/llvm/test/CodeGen/X86/pmulh.ll
+++ b/llvm/test/CodeGen/X86/pmulh.ll
@@ -114,22 +114,14 @@ define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
; SSE41-NEXT: pmulhw %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX2-LABEL: ashr_mulhw_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ashr_mulhw_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: retq
+; AVX-LABEL: ashr_mulhw_v4i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
+; AVX-NEXT: vpackusdw %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpmulhw %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
%a1 = ashr <4 x i32> %a, <i32 16, i32 16, i32 16, i32 16>
%b1 = ashr <4 x i32> %b, <i32 16, i32 16, i32 16, i32 16>
%c = mul <4 x i32> %a1, %b1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index ec03b1f4ff4897..4983ac4ae3afb3 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -4169,37 +4169,21 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
}
define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
-; SSE2-LABEL: trunc_packus_v4i32_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v4i32_v4i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i32_v4i8:
; SSE41: # %bb.0:
@@ -4274,39 +4258,22 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
}
define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-LABEL: trunc_packus_v4i32_v4i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movd %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movd %xmm1, (%rdi)
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i32_v4i8_store:
; SSE41: # %bb.0:
@@ -4328,16 +4295,26 @@ define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8_store:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8_store:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
+; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512F: # %bb.0:
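
The new AVX2-SLOW/AVX2-FAST split above reflects the fast-variable-per-lane-shuffle tuning: where such shuffles are cheap, a single vpshufb gathering bytes 0,4,8,12 beats the two-instruction pack chain. A small host-side emulation (illustrative only; it assumes little-endian lane layout and models the packs for nonnegative inputs) of why the two lowerings agree once the input is clamped to [0,255]:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint32_t In[4] = {0, 255, 17, 200}; // post vpminsd/vpmaxsd clamp

  // AVX2-FAST path: vpshufb picking byte 0 of each 32-bit lane.
  uint8_t Bytes[16];
  std::memcpy(Bytes, In, 16);
  uint8_t Shuffled[4];
  for (int I = 0; I < 4; ++I)
    Shuffled[I] = Bytes[4 * I]; // little-endian: low byte of each lane

  // AVX2-SLOW path: two unsigned-saturating pack stages.
  uint8_t Packed[4];
  for (int I = 0; I < 4; ++I) {
    uint16_t W = (uint16_t)std::min<uint32_t>(In[I], 65535); // vpackusdw
    Packed[I] = (uint8_t)std::min<uint16_t>(W, 255);         // vpackuswb
  }

  // For values already in [0, 255] the saturating packs and the byte
  // shuffle produce identical results.
  assert(std::memcmp(Shuffled, Packed, 4) == 0);
}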
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 11b7a43966c75a..8ad5eb14b45308 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -3920,41 +3920,23 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
}
define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
-; SSE2-LABEL: trunc_ssat_v4i32_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v4i32_v4i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
+; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v4i32_v4i8:
; SSE41: # %bb.0:
@@ -4019,43 +4001,24 @@ define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
}
define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-LABEL: trunc_ssat_v4i32_v4i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm1
-; SSE2-NEXT: movd %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v4i32_v4i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSSE3-NEXT: por %xmm0, %xmm2
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movd %xmm1, (%rdi)
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8_store:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
+; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_ssat_v4i32_v4i8_store:
; SSE41: # %bb.0:
@@ -4075,16 +4038,26 @@ define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
;
-; AVX2-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8_store:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
+; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
+; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8_store:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
+; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
+; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
+; AVX2-FAST-NEXT: retq
;
; AVX512F-LABEL: trunc_ssat_v4i32_v4i8_store:
; AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index da4011d8c420c8..1e17299c75f9ef 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2981,33 +2981,18 @@ define <16 x i8> @trunc_usat_v16i64_v16i8(ptr %p0) {
}
define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
-; SSE2-LABEL: trunc_usat_v4i32_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v4i32_v4i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_usat_v4i32_v4i8:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i32_v4i8:
; SSE41: # %bb.0:
@@ -3066,35 +3051,19 @@ define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
}
define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-LABEL: trunc_usat_v4i32_v4i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: movd %xmm2, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v4i32_v4i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movd %xmm2, (%rdi)
-; SSSE3-NEXT: retq
+; SSE2-SSSE3-LABEL: trunc_usat_v4i32_v4i8_store:
+; SSE2-SSSE3: # %bb.0:
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
+; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
+; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm2
+; SSE2-SSSE3-NEXT: movd %xmm2, (%rdi)
+; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i32_v4i8_store:
; SSE41: # %bb.0: