[llvm] 7b77dd6 - [X86] SimplifyDemandedBitsForTargetNode - add X86ISD::ANDNP handling
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 27 06:23:10 PDT 2023
Author: Simon Pilgrim
Date: 2023-06-27T14:23:00+01:00
New Revision: 7b77dd6afdee0c3f71c456d890940dc7618e2659
URL: https://github.com/llvm/llvm-project/commit/7b77dd6afdee0c3f71c456d890940dc7618e2659
DIFF: https://github.com/llvm/llvm-project/commit/7b77dd6afdee0c3f71c456d890940dc7618e2659.diff
LOG: [X86] SimplifyDemandedBitsForTargetNode - add X86ISD::ANDNP handling
Add X86ISD::ANDNP handling to targetShrinkDemandedConstant as well, which allows us to replace a lot of truncated masks with (rematerializable) all-ones values
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/psubus.ll
llvm/test/CodeGen/X86/udiv_fix_sat.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b59bcc415dac5..2f81399af049c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38669,14 +38669,14 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
return false;
};
// For vectors - if we have a constant, then try to sign extend.
- // TODO: Handle AND/ANDN cases.
+ // TODO: Handle AND cases.
unsigned ActiveBits = DemandedBits.getActiveBits();
if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
- (Opcode == ISD::OR || Opcode == ISD::XOR) &&
+ (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
- VT.getVectorNumElements());
+ VT.getVectorNumElements());
SDValue NewC =
TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
@@ -43818,6 +43818,31 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case X86ISD::ANDNP: {
+ KnownBits Known2;
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
+ Known, TLO, Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
+ if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
+ OriginalDemandedElts, Known2, TLO, Depth + 1))
+ return true;
+ assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
+ OriginalDemandedElts, TLO))
+ return true;
+
+ // ANDNP = (~Op0 & Op1);
+ Known.One &= Known2.Zero;
+ Known.Zero |= Known2.One;
+ break;
+ }
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index b319d4fc92503..695ab33c79815 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -82,8 +82,9 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: pand %xmm3, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pandn %xmm2, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: retq
@@ -270,31 +271,31 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) {
; CHECK-NEXT: orq %rax, %rdx
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; CHECK-NEXT: movdqa %xmm0, %xmm4
-; CHECK-NEXT: pxor %xmm3, %xmm4
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; CHECK-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647,2147483647,2147483647]
-; CHECK-NEXT: movdqa %xmm6, %xmm7
-; CHECK-NEXT: pcmpgtd %xmm5, %xmm7
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm3, %xmm4
-; CHECK-NEXT: pand %xmm7, %xmm4
-; CHECK-NEXT: pand %xmm4, %xmm0
-; CHECK-NEXT: pandn %xmm2, %xmm4
-; CHECK-NEXT: por %xmm0, %xmm4
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; CHECK-NEXT: movdqa %xmm0, %xmm3
+; CHECK-NEXT: pxor %xmm2, %xmm3
+; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-NEXT: movdqa %xmm5, %xmm6
+; CHECK-NEXT: pcmpgtd %xmm4, %xmm6
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm2, %xmm3
+; CHECK-NEXT: pand %xmm6, %xmm3
+; CHECK-NEXT: pcmpeqd %xmm4, %xmm4
+; CHECK-NEXT: pand %xmm3, %xmm0
+; CHECK-NEXT: pandn %xmm4, %xmm3
+; CHECK-NEXT: por %xmm0, %xmm3
; CHECK-NEXT: movdqa %xmm1, %xmm0
-; CHECK-NEXT: pxor %xmm3, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; CHECK-NEXT: pcmpgtd %xmm5, %xmm6
+; CHECK-NEXT: pxor %xmm2, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; CHECK-NEXT: pcmpgtd %xmm6, %xmm5
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm3, %xmm0
-; CHECK-NEXT: pand %xmm6, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm2, %xmm0
+; CHECK-NEXT: pand %xmm5, %xmm0
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: pandn %xmm2, %xmm0
+; CHECK-NEXT: pandn %xmm4, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
; CHECK-NEXT: retq
entry:
%conv = fptoui <4 x float> %x to <4 x i64>
@@ -550,32 +551,32 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
; CHECK-NEXT: movq %rdx, %xmm0
; CHECK-NEXT: punpcklqdq (%rsp), %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; CHECK-NEXT: movdqa %xmm0, %xmm3
-; CHECK-NEXT: pxor %xmm2, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; CHECK-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
-; CHECK-NEXT: movdqa %xmm5, %xmm6
-; CHECK-NEXT: pcmpgtd %xmm4, %xmm6
-; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm2, %xmm3
-; CHECK-NEXT: pand %xmm6, %xmm3
-; CHECK-NEXT: pand %xmm3, %xmm0
-; CHECK-NEXT: pandn %xmm1, %xmm3
-; CHECK-NEXT: por %xmm0, %xmm3
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
+; CHECK-NEXT: movdqa %xmm0, %xmm2
+; CHECK-NEXT: pxor %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
+; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; CHECK-NEXT: movdqa %xmm4, %xmm5
+; CHECK-NEXT: pcmpgtd %xmm3, %xmm5
+; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm2
+; CHECK-NEXT: pand %xmm5, %xmm2
+; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
+; CHECK-NEXT: pand %xmm2, %xmm0
+; CHECK-NEXT: pandn %xmm3, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; CHECK-NEXT: movdqa %xmm6, %xmm0
-; CHECK-NEXT: pxor %xmm2, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; CHECK-NEXT: pcmpgtd %xmm4, %xmm5
+; CHECK-NEXT: pxor %xmm1, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
+; CHECK-NEXT: pcmpgtd %xmm5, %xmm4
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: pcmpeqd %xmm2, %xmm0
-; CHECK-NEXT: pand %xmm5, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT: pand %xmm4, %xmm0
; CHECK-NEXT: pand %xmm0, %xmm6
-; CHECK-NEXT: pandn %xmm1, %xmm0
+; CHECK-NEXT: pandn %xmm3, %xmm0
; CHECK-NEXT: por %xmm6, %xmm0
-; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
+; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; CHECK-NEXT: addq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 8
; CHECK-NEXT: retq
@@ -733,8 +734,9 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) {
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <2147549183,2147549183,u,u>
; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
; CHECK-NEXT: andpd %xmm2, %xmm0
-; CHECK-NEXT: andnpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT: orpd %xmm0, %xmm2
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pandn %xmm1, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retq
entry:
@@ -802,7 +804,8 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) {
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
; CHECK-NEXT: pand %xmm2, %xmm0
-; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pandn %xmm1, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
@@ -1656,8 +1659,9 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-NEXT: pand %xmm3, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
; CHECK-NEXT: pand %xmm0, %xmm1
-; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pandn %xmm2, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT: retq
@@ -1849,7 +1853,7 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) {
; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; CHECK-NEXT: pcmpeqd %xmm2, %xmm3
; CHECK-NEXT: pand %xmm6, %xmm3
-; CHECK-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
+; CHECK-NEXT: pcmpeqd %xmm4, %xmm4
; CHECK-NEXT: pand %xmm3, %xmm0
; CHECK-NEXT: pandn %xmm4, %xmm3
; CHECK-NEXT: por %xmm0, %xmm3
@@ -2124,7 +2128,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; CHECK-NEXT: pcmpeqd %xmm1, %xmm2
; CHECK-NEXT: pand %xmm5, %xmm2
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
+; CHECK-NEXT: pcmpeqd %xmm3, %xmm3
; CHECK-NEXT: pand %xmm2, %xmm0
; CHECK-NEXT: pandn %xmm3, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
@@ -2292,8 +2296,9 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) {
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <2147549183,2147549183,u,u>
; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
; CHECK-NEXT: andpd %xmm2, %xmm0
-; CHECK-NEXT: andnpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; CHECK-NEXT: orpd %xmm0, %xmm2
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pandn %xmm1, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retq
entry:
@@ -2356,7 +2361,8 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
; CHECK-NEXT: pand %xmm2, %xmm0
-; CHECK-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pandn %xmm1, %xmm2
; CHECK-NEXT: por %xmm0, %xmm2
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index b32f4959e16ba..684e954d2c287 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -12,50 +12,49 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm8, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm10, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm9[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm6
+; SSE2-NEXT: pxor %xmm9, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm7, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm12, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm1
-; SSE2-NEXT: pandn %xmm6, %xmm9
-; SSE2-NEXT: por %xmm1, %xmm9
+; SSE2-NEXT: movdqa %xmm7, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
+; SSE2-NEXT: pand %xmm11, %xmm10
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pand %xmm10, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm10
+; SSE2-NEXT: por %xmm1, %xmm10
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
+; SSE2-NEXT: pxor %xmm9, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[0,0,2,2]
; SSE2-NEXT: movdqa %xmm7, %xmm12
; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
; SSE2-NEXT: pand %xmm12, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm6, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm9[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[0,2]
; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,0,2,2]
+; SSE2-NEXT: pxor %xmm9, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,0,2,2]
; SSE2-NEXT: movdqa %xmm7, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm10, %xmm11
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: pxor %xmm10, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,0,2,2]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm2, %xmm10
+; SSE2-NEXT: pxor %xmm9, %xmm10
+; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[0,0,2,2]
; SSE2-NEXT: pcmpgtd %xmm12, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm9
+; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
; SSE2-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm10, %xmm5
+; SSE2-NEXT: pxor %xmm6, %xmm5
; SSE2-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE2-NEXT: pxor %xmm10, %xmm4
+; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
@@ -65,7 +64,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movss %xmm1, (%rdi)
; SSE2-NEXT: .LBB0_2: # %else
; SSE2-NEXT: pand %xmm11, %xmm0
-; SSE2-NEXT: pand %xmm7, %xmm9
+; SSE2-NEXT: pand %xmm7, %xmm10
; SSE2-NEXT: testb $2, %al
; SSE2-NEXT: je .LBB0_4
; SSE2-NEXT: # %bb.3: # %cond.store1
@@ -74,8 +73,8 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: .LBB0_4: # %else2
; SSE2-NEXT: pand %xmm0, %xmm3
; SSE2-NEXT: pandn %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm9, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm9
+; SSE2-NEXT: pand %xmm10, %xmm2
+; SSE2-NEXT: pandn %xmm6, %xmm10
; SSE2-NEXT: testb $4, %al
; SSE2-NEXT: je .LBB0_6
; SSE2-NEXT: # %bb.5: # %cond.store3
@@ -83,7 +82,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: movd %xmm4, 8(%rdi)
; SSE2-NEXT: .LBB0_6: # %else4
; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: por %xmm9, %xmm2
+; SSE2-NEXT: por %xmm10, %xmm2
; SSE2-NEXT: testb $8, %al
; SSE2-NEXT: je .LBB0_8
; SSE2-NEXT: # %bb.7: # %cond.store5
@@ -314,20 +313,20 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-LABEL: truncstore_v8i64_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: pxor %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm7[0,0,2,2]
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pand %xmm12, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm2, %xmm7
+; SSE2-NEXT: movdqa %xmm10, %xmm11
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm11
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm9, %xmm8
+; SSE2-NEXT: pand %xmm11, %xmm8
+; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
+; SSE2-NEXT: pand %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm8
+; SSE2-NEXT: por %xmm2, %xmm8
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pxor %xmm9, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm2[0,0,2,2]
@@ -337,7 +336,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
; SSE2-NEXT: pand %xmm12, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
+; SSE2-NEXT: pandn %xmm7, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pxor %xmm9, %xmm3
@@ -348,7 +347,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
; SSE2-NEXT: pand %xmm12, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
+; SSE2-NEXT: pandn %xmm7, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm9, %xmm0
@@ -358,7 +357,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
; SSE2-NEXT: pand %xmm10, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm0
+; SSE2-NEXT: pandn %xmm7, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -367,15 +366,14 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT: pcmpeqd %xmm6, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm7, %xmm5
; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
+; SSE2-NEXT: pxor %xmm7, %xmm4
; SSE2-NEXT: packssdw %xmm5, %xmm4
; SSE2-NEXT: packsswb %xmm4, %xmm4
; SSE2-NEXT: pmovmskb %xmm4, %eax
@@ -1281,31 +1279,31 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647,2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm8, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pand %xmm9, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm6
-; SSE2-NEXT: por %xmm1, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm1, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm1
+; SSE2-NEXT: pandn %xmm6, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
; SSE2-NEXT: movmskps %xmm3, %eax
; SSE2-NEXT: xorl $15, %eax
@@ -1476,33 +1474,33 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm8, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pand %xmm9, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm6
-; SSE2-NEXT: por %xmm0, %xmm6
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm5
+; SSE2-NEXT: pxor %xmm4, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm5
+; SSE2-NEXT: pand %xmm8, %xmm5
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm5
+; SSE2-NEXT: por %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
@@ -1759,32 +1757,33 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE2-LABEL: truncstore_v4i64_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm8, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm9
+; SSE2-NEXT: pxor %xmm5, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm7, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm9, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm4
+; SSE2-NEXT: pand %xmm8, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm6, %xmm6
; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: pandn %xmm6, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pxor %xmm5, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm8, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
+; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm0
+; SSE2-NEXT: pandn %xmm6, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm4
; SSE2-NEXT: packuswb %xmm0, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
@@ -2056,8 +2055,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
@@ -2195,8 +2195,9 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -2373,8 +2374,9 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: packuswb %xmm3, %xmm3
@@ -4774,8 +4776,9 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
@@ -5000,8 +5003,9 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; SSE2-NEXT: pxor %xmm0, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: pandn %xmm3, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: packuswb %xmm4, %xmm4
diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll
index 28d4a882b21ad..0447b829a601f 100644
--- a/llvm/test/CodeGen/X86/psubus.ll
+++ b/llvm/test/CodeGen/X86/psubus.ll
@@ -775,7 +775,7 @@ define <8 x i16> @test13(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
@@ -1041,7 +1041,7 @@ define <8 x i16> @test15(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
@@ -1564,7 +1564,7 @@ define <8 x i16> @psubus_8i32_max(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
@@ -1638,7 +1638,7 @@ define <8 x i16> @psubus_8i64_max(<8 x i16> %x, <8 x i64> %y) nounwind {
; SSE2OR3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
; SSE2OR3-NEXT: pcmpeqd %xmm5, %xmm8
; SSE2OR3-NEXT: pand %xmm9, %xmm8
-; SSE2OR3-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535]
+; SSE2OR3-NEXT: pcmpeqd %xmm7, %xmm7
; SSE2OR3-NEXT: pand %xmm8, %xmm2
; SSE2OR3-NEXT: pandn %xmm7, %xmm8
; SSE2OR3-NEXT: por %xmm2, %xmm8
@@ -1936,7 +1936,7 @@ define <8 x i16> @psubus_i16_i32_max_swapped(<8 x i16> %x, <8 x i32> %y) nounwin
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
@@ -2031,7 +2031,7 @@ define <8 x i16> @psubus_i16_i32_min(<8 x i16> %x, <8 x i32> %y) nounwind {
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
@@ -2620,7 +2620,7 @@ define <8 x i16> @test32(<8 x i16> %a0, <8 x i32> %a1) {
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm2
; SSSE3-NEXT: pandn %xmm4, %xmm6
; SSSE3-NEXT: por %xmm2, %xmm6
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index d4d18c4307f33..9b12d5d4b1e5d 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -341,7 +341,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
; X64-NEXT: pand %xmm7, %xmm10
; X64-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,3,3]
; X64-NEXT: por %xmm10, %xmm3
-; X64-NEXT: movdqa {{.*#+}} xmm7 = [8589934591,8589934591]
+; X64-NEXT: pcmpeqd %xmm7, %xmm7
; X64-NEXT: pand %xmm3, %xmm8
; X64-NEXT: pandn %xmm7, %xmm3
; X64-NEXT: por %xmm8, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 10785126e668a..61ba5cf3194b2 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -30,8 +30,9 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: retq
@@ -46,8 +47,9 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT: retq
@@ -137,8 +139,9 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
@@ -154,8 +157,9 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT: movq %xmm0, (%rdi)
@@ -262,8 +266,9 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm1, %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSE2-NEXT: retq
@@ -290,8 +295,9 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSSE3-NEXT: pandn %xmm1, %xmm3
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
; SSSE3-NEXT: retq
@@ -445,52 +451,52 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm6
; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE2-NEXT: pand %xmm9, %xmm7
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm1, %xmm5
+; SSE2-NEXT: pxor %xmm3, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; SSE2-NEXT: movdqa %xmm4, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
+; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: pand %xmm8, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm7
+; SSE2-NEXT: pandn %xmm5, %xmm7
; SSE2-NEXT: por %xmm1, %xmm7
; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
+; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
+; SSE2-NEXT: movdqa %xmm4, %xmm9
; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-NEXT: pand %xmm9, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm6
-; SSE2-NEXT: pandn %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm5, %xmm8
+; SSE2-NEXT: movdqa %xmm4, %xmm8
; SSE2-NEXT: pcmpgtd %xmm7, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm6
; SSE2-NEXT: pand %xmm8, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: por %xmm0, %xmm6
; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
+; SSE2-NEXT: pxor %xmm3, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm5
+; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
; SSE2-NEXT: retq
@@ -501,52 +507,52 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(ptr %p0) {
; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm6
; SSSE3-NEXT: movdqa 48(%rdi), %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
-; SSSE3-NEXT: movdqa %xmm5, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7
-; SSSE3-NEXT: pand %xmm9, %xmm7
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm1, %xmm5
+; SSSE3-NEXT: pxor %xmm3, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
+; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT: pand %xmm8, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pandn %xmm3, %xmm7
+; SSSE3-NEXT: pandn %xmm5, %xmm7
; SSSE3-NEXT: por %xmm1, %xmm7
; SSSE3-NEXT: movdqa %xmm6, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
+; SSSE3-NEXT: pxor %xmm3, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
-; SSSE3-NEXT: movdqa %xmm5, %xmm9
+; SSSE3-NEXT: movdqa %xmm4, %xmm9
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
; SSSE3-NEXT: pand %xmm9, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm6
-; SSSE3-NEXT: pandn %xmm3, %xmm1
+; SSSE3-NEXT: pandn %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm4, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: movdqa %xmm5, %xmm8
+; SSSE3-NEXT: movdqa %xmm4, %xmm8
; SSSE3-NEXT: pcmpgtd %xmm7, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6
; SSSE3-NEXT: pand %xmm8, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm3, %xmm6
+; SSSE3-NEXT: pandn %xmm5, %xmm6
; SSSE3-NEXT: por %xmm0, %xmm6
; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
+; SSSE3-NEXT: pxor %xmm3, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm5
+; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm5, %xmm0
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
; SSSE3-NEXT: retq
@@ -720,8 +726,9 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -737,8 +744,9 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -841,8 +849,9 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -859,8 +868,9 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -963,66 +973,66 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v4i64_v4i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v4i64_v4i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: retq
@@ -1130,33 +1140,33 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_usat_v4i64_v4i16_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT: movq %xmm1, (%rdi)
@@ -1164,33 +1174,33 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
;
; SSSE3-LABEL: trunc_usat_v4i64_v4i16_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm3, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm2, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT: movq %xmm1, (%rdi)
@@ -1304,57 +1314,57 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
; SSE2-LABEL: trunc_usat_v8i64_v8i16:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
+; SSE2-NEXT: movdqa (%rdi), %xmm4
; SSE2-NEXT: movdqa 16(%rdi), %xmm0
; SSE2-NEXT: movdqa 32(%rdi), %xmm6
; SSE2-NEXT: movdqa 48(%rdi), %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm4, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSE2-NEXT: movdqa %xmm3, %xmm8
+; SSE2-NEXT: pcmpgtd %xmm5, %xmm8
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm9, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT: pand %xmm8, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm5, %xmm5
; SSE2-NEXT: pand %xmm1, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm1
+; SSE2-NEXT: pandn %xmm5, %xmm1
; SSE2-NEXT: por %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm6
+; SSE2-NEXT: pxor %xmm2, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm3, %xmm9
; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm6
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm6
; SSE2-NEXT: pand %xmm9, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pandn %xmm2, %xmm6
+; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: por %xmm7, %xmm6
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pxor %xmm3, %xmm7
+; SSE2-NEXT: movdqa %xmm4, %xmm7
+; SSE2-NEXT: pxor %xmm2, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: movdqa %xmm4, %xmm9
+; SSE2-NEXT: movdqa %xmm3, %xmm9
; SSE2-NEXT: pcmpgtd %xmm8, %xmm9
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm7
; SSE2-NEXT: pand %xmm9, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm7
-; SSE2-NEXT: por %xmm5, %xmm7
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSE2-NEXT: pand %xmm7, %xmm4
+; SSE2-NEXT: pandn %xmm5, %xmm7
+; SSE2-NEXT: por %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pxor %xmm2, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm4
+; SSE2-NEXT: pand %xmm4, %xmm0
+; SSE2-NEXT: pandn %xmm5, %xmm4
+; SSE2-NEXT: por %xmm0, %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
@@ -1369,57 +1379,57 @@ define <8 x i16> @trunc_usat_v8i64_v8i16(ptr %p0) {
;
; SSSE3-LABEL: trunc_usat_v8i64_v8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa (%rdi), %xmm5
+; SSSE3-NEXT: movdqa (%rdi), %xmm4
; SSSE3-NEXT: movdqa 16(%rdi), %xmm0
; SSSE3-NEXT: movdqa 32(%rdi), %xmm6
; SSSE3-NEXT: movdqa 48(%rdi), %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm6, %xmm1
-; SSSE3-NEXT: pxor %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm4, %xmm9
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
+; SSSE3-NEXT: pxor %xmm2, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm3, %xmm8
+; SSSE3-NEXT: pcmpgtd %xmm5, %xmm8
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm9, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
+; SSSE3-NEXT: pand %xmm8, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT: pand %xmm1, %xmm6
-; SSSE3-NEXT: pandn %xmm2, %xmm1
+; SSSE3-NEXT: pandn %xmm5, %xmm1
; SSSE3-NEXT: por %xmm6, %xmm1
; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm6
+; SSSE3-NEXT: pxor %xmm2, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2]
-; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: movdqa %xmm3, %xmm9
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm6
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm6
; SSSE3-NEXT: pand %xmm9, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pandn %xmm2, %xmm6
+; SSSE3-NEXT: pandn %xmm5, %xmm6
; SSSE3-NEXT: por %xmm7, %xmm6
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
-; SSSE3-NEXT: pxor %xmm3, %xmm7
+; SSSE3-NEXT: movdqa %xmm4, %xmm7
+; SSSE3-NEXT: pxor %xmm2, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSSE3-NEXT: movdqa %xmm4, %xmm9
+; SSSE3-NEXT: movdqa %xmm3, %xmm9
; SSSE3-NEXT: pcmpgtd %xmm8, %xmm9
; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm7
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm7
; SSSE3-NEXT: pand %xmm9, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm5
-; SSSE3-NEXT: pandn %xmm2, %xmm7
-; SSSE3-NEXT: por %xmm5, %xmm7
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
+; SSSE3-NEXT: pand %xmm7, %xmm4
+; SSSE3-NEXT: pandn %xmm5, %xmm7
+; SSSE3-NEXT: por %xmm4, %xmm7
+; SSSE3-NEXT: movdqa %xmm0, %xmm4
+; SSSE3-NEXT: pxor %xmm2, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm0
+; SSSE3-NEXT: pandn %xmm5, %xmm4
+; SSSE3-NEXT: por %xmm0, %xmm4
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
@@ -1561,8 +1571,9 @@ define <4 x i16> @trunc_usat_v4i32_v4i16(<4 x i32> %a0) {
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
@@ -1575,8 +1586,9 @@ define <4 x i16> @trunc_usat_v4i32_v4i16(<4 x i32> %a0) {
; SSSE3-NEXT: pxor %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: retq
@@ -1643,8 +1655,9 @@ define void @trunc_usat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
@@ -1658,8 +1671,9 @@ define void @trunc_usat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
; SSSE3-NEXT: pxor %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq %xmm2, (%rdi)
@@ -1751,25 +1765,25 @@ define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
;
; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
+; SSSE3-NEXT: movdqa %xmm4, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm1, %xmm5
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm0
+; SSSE3-NEXT: pandn %xmm3, %xmm5
+; SSSE3-NEXT: por %xmm5, %xmm0
+; SSSE3-NEXT: pxor %xmm1, %xmm2
+; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
+; SSSE3-NEXT: pand %xmm4, %xmm1
+; SSSE3-NEXT: pandn %xmm3, %xmm4
+; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm1, %xmm5
+; SSSE3-NEXT: pshufb %xmm1, %xmm4
; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v8i32_v8i16:
@@ -2001,8 +2015,9 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -2020,8 +2035,9 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
@@ -2110,8 +2126,9 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
@@ -2131,8 +2148,9 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movd %xmm1, %eax
@@ -2220,67 +2238,68 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-LABEL: trunc_usat_v4i64_v4i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm6, %xmm3
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm3, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pxor %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
+; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pandn %xmm4, %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm3
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v4i64_v4i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
+; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa %xmm0, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm6, %xmm3
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
+; SSSE3-NEXT: pand %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm3, %xmm0
+; SSSE3-NEXT: movdqa %xmm1, %xmm3
+; SSSE3-NEXT: pxor %xmm2, %xmm3
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
+; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd %xmm2, %xmm3
+; SSSE3-NEXT: pand %xmm5, %xmm3
+; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pandn %xmm4, %xmm3
+; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm1, %xmm4
+; SSSE3-NEXT: pshufb %xmm1, %xmm3
; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i64_v4i8:
@@ -2391,63 +2410,64 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_usat_v4i64_v4i8_store:
; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pxor %xmm3, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSE2-NEXT: movdqa %xmm5, %xmm6
+; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
+; SSE2-NEXT: pand %xmm6, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm2
+; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
+; SSE2-NEXT: pxor %xmm3, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm6, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm3, %xmm0
+; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm0
+; SSE2-NEXT: pandn %xmm4, %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: packuswb %xmm0, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm3
-; SSE2-NEXT: movd %xmm3, (%rdi)
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0]
+; SSE2-NEXT: pand %xmm1, %xmm0
+; SSE2-NEXT: pand %xmm1, %xmm2
+; SSE2-NEXT: packuswb %xmm0, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: packuswb %xmm2, %xmm2
+; SSE2-NEXT: movd %xmm2, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_usat_v4i64_v4i8_store:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT: pxor %xmm3, %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
+; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
+; SSSE3-NEXT: movdqa %xmm5, %xmm6
+; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT: pand %xmm7, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2
+; SSSE3-NEXT: pand %xmm6, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn %xmm3, %xmm2
+; SSSE3-NEXT: pandn %xmm4, %xmm2
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
+; SSSE3-NEXT: pxor %xmm3, %xmm0
+; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm6, %xmm0
+; SSSE3-NEXT: pcmpeqd %xmm3, %xmm0
+; SSSE3-NEXT: pand %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm3, %xmm0
+; SSSE3-NEXT: pandn %xmm4, %xmm0
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm1, %xmm0
@@ -3489,8 +3509,9 @@ define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
@@ -3503,8 +3524,9 @@ define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
; SSSE3-NEXT: pxor %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
@@ -3572,8 +3594,9 @@ define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
+; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm2
@@ -3587,8 +3610,9 @@ define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; SSSE3-NEXT: pxor %xmm0, %xmm1
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
+; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movd %xmm2, (%rdi)
More information about the llvm-commits
mailing list