[llvm] 57b0194 - [X86] IsNOT - fold PCMPGT(C, X) -> PCMPGT(X,C-1)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 27 04:34:09 PDT 2023
Author: Simon Pilgrim
Date: 2023-09-27T12:33:55+01:00
New Revision: 57b0194b696f8fedb53201430d6db1bec313781c
URL: https://github.com/llvm/llvm-project/commit/57b0194b696f8fedb53201430d6db1bec313781c
DIFF: https://github.com/llvm/llvm-project/commit/57b0194b696f8fedb53201430d6db1bec313781c.diff
LOG: [X86] IsNOT - fold PCMPGT(C, X) -> PCMPGT(X,C-1)
To invert the result, we can profitably commute a PCMPGT node when the LHS is a constant C (with C > min_signed_value): https://alive2.llvm.org/ce/z/LxcPqm
This allows the constant to fold and helps reduce register pressure.
Fixes #67347
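
For reference, the scalar identity behind the fold: for signed integers, NOT(C > X) is equivalent to X > C - 1 whenever C is not the minimum signed value (where C - 1 would wrap). A minimal standalone sketch (not part of the patch) that exhaustively checks this over 8-bit values:

// Verify NOT(C > X) == (X > C - 1) for all signed 8-bit lanes,
// skipping C == INT8_MIN, which the patch also guards against.
#include <cassert>
#include <climits>

int main() {
  for (int c = SCHAR_MIN + 1; c <= SCHAR_MAX; ++c) {
    for (int x = SCHAR_MIN; x <= SCHAR_MAX; ++x) {
      bool inverted = !(c > x);     // not(pcmpgt(C, X)) per lane
      bool commuted = (x > c - 1);  // pcmpgt(X, C - 1) per lane
      assert(inverted == commuted);
    }
  }
  return 0;
}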
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/fpclamptosat_vec.ll
llvm/test/CodeGen/X86/icmp-pow2-diff.ll
llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
llvm/test/CodeGen/X86/pr48215.ll
llvm/test/CodeGen/X86/sat-add.ll
llvm/test/CodeGen/X86/vec_setcc-2.ll
llvm/test/CodeGen/X86/vector-trunc-usat.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d5cbdf596b0929b..08b9098871b354b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4881,9 +4881,10 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
return true;
}
-// Match (xor X, -1) -> X.
-// Match extract_subvector(xor X, -1) -> extract_subvector(X).
-// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
+// Match not(xor X, -1) -> X.
+// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
+// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
+// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
V = peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
@@ -4898,6 +4899,29 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
Not, V.getOperand(1));
}
}
+ if (V.getOpcode() == X86ISD::PCMPGT &&
+ !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
+ !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
+ V.getOperand(0).hasOneUse()) {
+ APInt UndefElts;
+ SmallVector<APInt> EltBits;
+ if (getTargetConstantBitsFromNode(V.getOperand(0),
+ V.getScalarValueSizeInBits(), UndefElts,
+ EltBits)) {
+ // Don't fold min_signed_value -> (min_signed_value - 1)
+ bool MinSigned = false;
+ for (APInt &Elt : EltBits) {
+ MinSigned |= Elt.isMinSignedValue();
+ Elt -= 1;
+ }
+ if (!MinSigned) {
+ SDLoc DL(V);
+ MVT VT = V.getSimpleValueType();
+ return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
+ getConstVector(EltBits, UndefElts, VT, DAG, DL));
+ }
+ }
+ }
SmallVector<SDValue, 2> CatOps;
if (collectConcatOps(V.getNode(), CatOps, DAG)) {
for (SDValue &CatOp : CatOps) {
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 845f128ea9412f7..5e4f690fcdbbf69 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -76,12 +76,11 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; CHECK-NEXT: pxor %xmm2, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: pand %xmm3, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pandn %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pand %xmm2, %xmm0
@@ -731,12 +730,11 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) {
; CHECK-NEXT: orpd %xmm1, %xmm0
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-NEXT: xorpd %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <2147549183,2147549183,u,u>
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
-; CHECK-NEXT: andpd %xmm2, %xmm0
-; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pandn %xmm0, %xmm2
+; CHECK-NEXT: psrld $16, %xmm1
+; CHECK-NEXT: por %xmm2, %xmm1
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retq
entry:
@@ -801,12 +799,11 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) {
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-NEXT: pxor %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
-; CHECK-NEXT: pand %xmm2, %xmm0
-; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pandn %xmm0, %xmm2
+; CHECK-NEXT: psrld $16, %xmm1
+; CHECK-NEXT: por %xmm2, %xmm1
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
@@ -1653,12 +1650,11 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
; CHECK-NEXT: pxor %xmm2, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; CHECK-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm3
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: pand %xmm3, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: pandn %xmm1, %xmm0
; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
; CHECK-NEXT: pxor %xmm0, %xmm1
; CHECK-NEXT: pand %xmm2, %xmm0
@@ -2293,12 +2289,11 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) {
; CHECK-NEXT: orpd %xmm1, %xmm0
; CHECK-NEXT: movapd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-NEXT: xorpd %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = <2147549183,2147549183,u,u>
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
-; CHECK-NEXT: andpd %xmm2, %xmm0
-; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pandn %xmm0, %xmm2
+; CHECK-NEXT: psrld $16, %xmm1
+; CHECK-NEXT: por %xmm2, %xmm1
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retq
entry:
@@ -2358,12 +2353,11 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; CHECK-NEXT: pxor %xmm0, %xmm1
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; CHECK-NEXT: pcmpgtd %xmm1, %xmm2
-; CHECK-NEXT: pand %xmm2, %xmm0
-; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-NEXT: pxor %xmm2, %xmm1
-; CHECK-NEXT: por %xmm0, %xmm1
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pandn %xmm0, %xmm2
+; CHECK-NEXT: psrld $16, %xmm1
+; CHECK-NEXT: por %xmm2, %xmm1
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
index 00038e60628133a..dd6553bec4b1d11 100644
--- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
@@ -215,29 +215,28 @@ define <16 x i1> @andnot_ne_v16i8_fail_max_not_n1(<16 x i8> %x) nounwind {
; AVX512-LABEL: andnot_ne_v16i8_fail_max_not_n1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2
+; AVX512-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $54, %xmm2, %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX2-LABEL: andnot_ne_v16i8_fail_max_not_n1:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; SSE-LABEL: andnot_ne_v16i8_fail_max_not_n1:
; SSE: # %bb.0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; SSE-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm2, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
+; SSE-NEXT: movdqa %xmm0, %xmm2
+; SSE-NEXT: pcmpeqb %xmm1, %xmm2
+; SSE-NEXT: pcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
%cmp1 = icmp ne <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%cmp2 = icmp ne <16 x i8> %x, <i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127, i8 127>
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index e0eca731b144d0b..715df982e1a069c 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -1408,9 +1408,9 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
@@ -1633,9 +1633,9 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342]
+; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -1916,9 +1916,9 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062]
+; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2028,12 +2028,11 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647,2147483647,2147483647]
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pand %xmm0, %xmm3
@@ -2064,13 +2063,11 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
-; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372041149743103,9223372041149743103]
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: pxor %xmm2, %xmm0
+; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
@@ -2096,13 +2093,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
-; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -2114,12 +2107,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm2 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -2168,12 +2158,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd %xmm4, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pand %xmm0, %xmm3
@@ -2206,13 +2195,11 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movapd {{.*#+}} xmm4 = [65535,65535]
-; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854841343,9223372036854841343]
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,2,2,3]
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: pxor %xmm2, %xmm0
+; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
; SSE4-NEXT: movmskpd %xmm3, %eax
@@ -2232,64 +2219,31 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi)
; SSE4-NEXT: retq
;
-; AVX1-LABEL: truncstore_v2i64_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
-; AVX1-NEXT: # xmm5 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskpd %xmm1, %eax
-; AVX1-NEXT: xorl $3, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: jne .LBB7_1
-; AVX1-NEXT: # %bb.2: # %else
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: jne .LBB7_3
-; AVX1-NEXT: .LBB7_4: # %else2
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB7_1: # %cond.store
-; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: je .LBB7_4
-; AVX1-NEXT: .LBB7_3: # %cond.store1
-; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: truncstore_v2i64_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [65535,65535]
-; AVX2-NEXT: # xmm3 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343]
-; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskpd %xmm1, %eax
-; AVX2-NEXT: xorl $3, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: jne .LBB7_1
-; AVX2-NEXT: # %bb.2: # %else
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: jne .LBB7_3
-; AVX2-NEXT: .LBB7_4: # %else2
-; AVX2-NEXT: retq
-; AVX2-NEXT: .LBB7_1: # %cond.store
-; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je .LBB7_4
-; AVX2-NEXT: .LBB7_3: # %cond.store1
-; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: truncstore_v2i64_v2i16:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovmskpd %xmm1, %eax
+; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: jne .LBB7_1
+; AVX-NEXT: # %bb.2: # %else
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: jne .LBB7_3
+; AVX-NEXT: .LBB7_4: # %else2
+; AVX-NEXT: retq
+; AVX-NEXT: .LBB7_1: # %cond.store
+; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: je .LBB7_4
+; AVX-NEXT: .LBB7_3: # %cond.store1
+; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i16:
; AVX512F: # %bb.0:
@@ -2347,12 +2301,11 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: pandn %xmm4, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
@@ -2383,16 +2336,14 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-LABEL: truncstore_v2i64_v2i8:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm4, %xmm4
-; SSE4-NEXT: movapd {{.*#+}} xmm3 = [255,255]
-; SSE4-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm0, %xmm5
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854776063,9223372036854776063]
-; SSE4-NEXT: pcmpgtq %xmm5, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE4-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
-; SSE4-NEXT: movmskpd %xmm4, %eax
+; SSE4-NEXT: pxor %xmm3, %xmm3
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE4-NEXT: pxor %xmm2, %xmm0
+; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
+; SSE4-NEXT: movmskpd %xmm3, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB8_1
@@ -2402,69 +2353,37 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: .LBB8_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB8_1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm3, (%rdi)
+; SSE4-NEXT: pextrb $0, %xmm2, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB8_4
; SSE4-NEXT: .LBB8_3: # %cond.store1
-; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi)
; SSE4-NEXT: retq
;
-; AVX1-LABEL: truncstore_v2i64_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: # xmm5 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskpd %xmm1, %eax
-; AVX1-NEXT: xorl $3, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: jne .LBB8_1
-; AVX1-NEXT: # %bb.2: # %else
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: jne .LBB8_3
-; AVX1-NEXT: .LBB8_4: # %else2
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB8_1: # %cond.store
-; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: je .LBB8_4
-; AVX1-NEXT: .LBB8_3: # %cond.store1
-; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: truncstore_v2i64_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [255,255]
-; AVX2-NEXT: # xmm3 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskpd %xmm1, %eax
-; AVX2-NEXT: xorl $3, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: jne .LBB8_1
-; AVX2-NEXT: # %bb.2: # %else
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: jne .LBB8_3
-; AVX2-NEXT: .LBB8_4: # %else2
-; AVX2-NEXT: retq
-; AVX2-NEXT: .LBB8_1: # %cond.store
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je .LBB8_4
-; AVX2-NEXT: .LBB8_3: # %cond.store1
-; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: truncstore_v2i64_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
+; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT: vblendvpd %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovmskpd %xmm1, %eax
+; AVX-NEXT: xorl $3, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: jne .LBB8_1
+; AVX-NEXT: # %bb.2: # %else
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: jne .LBB8_3
+; AVX-NEXT: .LBB8_4: # %else2
+; AVX-NEXT: retq
+; AVX-NEXT: .LBB8_1: # %cond.store
+; AVX-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: je .LBB8_4
+; AVX-NEXT: .LBB8_3: # %cond.store1
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: truncstore_v2i64_v2i8:
; AVX512F: # %bb.0:
@@ -4719,13 +4638,12 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrld $16, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
@@ -4946,18 +4864,18 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: packuswb %xmm4, %xmm4
-; SSE2-NEXT: packuswb %xmm4, %xmm4
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE2-NEXT: movdqa %xmm3, %xmm4
+; SSE2-NEXT: pandn %xmm0, %xmm4
+; SSE2-NEXT: psrld $24, %xmm3
+; SSE2-NEXT: por %xmm4, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
+; SSE2-NEXT: packuswb %xmm3, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: movmskps %xmm2, %ecx
; SSE2-NEXT: xorl $15, %ecx
; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: movd %xmm4, %eax
+; SSE2-NEXT: movd %xmm3, %eax
; SSE2-NEXT: jne .LBB14_1
; SSE2-NEXT: # %bb.2: # %else
; SSE2-NEXT: testb $2, %cl
diff --git a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll
index de599c7468d3f34..02669a6212d87c3 100644
--- a/llvm/test/CodeGen/X86/pr48215.ll
+++ b/llvm/test/CodeGen/X86/pr48215.ll
@@ -35,12 +35,12 @@ define i32 @PR48215(i32 %a0, i32 %a1) {
; AVX2-NEXT: vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT: vmovd %edx, %xmm1
; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpcmpgtd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,6,7]
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vmovmskps %ymm0, %ecx
-; AVX2-NEXT: vmovmskps %xmm1, %eax
+; AVX2-NEXT: xorl $255, %ecx
+; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
+; AVX2-NEXT: vmovmskps %xmm0, %eax
+; AVX2-NEXT: xorl $15, %eax
; AVX2-NEXT: addl %ecx, %eax
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index aad13f0e829581e..c34ee3a7b786ab5 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -472,12 +472,13 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_min(<4 x i32> %x) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483605,2147483605,2147483605,2147483605]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: unsigned_sat_constant_v4i32_using_min:
@@ -644,24 +645,19 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_min:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551573,18446744073709551573]
-; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: pxor %xmm0, %xmm3
-; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775765,9223372036854775765]
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT: pxor %xmm1, %xmm0
+; SSE42-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE42-NEXT: movdqa %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573]
-; AVX2-NEXT: # xmm1 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_setcc-2.ll b/llvm/test/CodeGen/X86/vec_setcc-2.ll
index 367f9f21041a559..531efa2476f68c1 100644
--- a/llvm/test/CodeGen/X86/vec_setcc-2.ll
+++ b/llvm/test/CodeGen/X86/vec_setcc-2.ll
@@ -303,14 +303,13 @@ define <2 x i1> @uge_v2i64_splat(<2 x i64> %x) {
; SSE2-LABEL: uge_v2i64_splat:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147484090,2147484090,2147484090,2147484090]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: uge_v2i64_splat:
@@ -377,12 +376,11 @@ define <2 x i1> @ult_v2i64_splat(<2 x i64> %x) {
; SSE2-LABEL: ult_v2i64_splat:
; SSE2: ## %bb.0:
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147484090,2147484090,2147484090,2147484090]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT: pandn %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: ult_v2i64_splat:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index e717ede16a9f071..f40a7e39b986999 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -24,12 +24,11 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -41,39 +40,23 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: retq
;
-; AVX1-LABEL: trunc_usat_v2i64_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v2i64_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm1 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_usat_v2i64_v2i32:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i32:
; AVX512F: # %bb.0:
@@ -116,12 +99,11 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -134,42 +116,25 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483647,2147483647,2147483647,2147483647]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
-; AVX1-LABEL: trunc_usat_v2i64_v2i32_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vmovlpd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v2i64_v2i32_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [4294967295,4294967295]
-; AVX2-NEXT: # xmm1 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-NEXT: vmovlpd %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_usat_v2i64_v2i32_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vmovlpd %xmm0, (%rdi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store:
; AVX512F: # %bb.0:
@@ -289,10 +254,9 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
+; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
@@ -302,10 +266,9 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-ALL-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
+; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -317,10 +280,9 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-FAST-PERLANE-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
+; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -600,12 +562,11 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -618,13 +579,12 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -632,37 +592,27 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
-; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
-; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
-; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-SLOW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
-; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
-; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
-; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-FAST-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: retq
;
@@ -705,12 +655,11 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm3, %xmm1
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-SSSE3-NEXT: pandn %xmm2, %xmm1
; SSE2-SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm2
; SSE2-SSSE3-NEXT: pand %xmm0, %xmm1
@@ -724,13 +673,12 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
@@ -739,13 +687,9 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rdi)
@@ -753,12 +697,9 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
-; AVX2-SLOW-NEXT: # xmm1 = mem[0,0]
-; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
-; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-SLOW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
@@ -766,12 +707,9 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [65535,65535]
-; AVX2-FAST-NEXT: # xmm1 = mem[0,0]
-; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
-; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX2-FAST-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-FAST-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
; AVX2-FAST-NEXT: retq
@@ -901,9 +839,9 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342]
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -1036,9 +974,9 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342]
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -1275,13 +1213,12 @@ define <4 x i16> @trunc_usat_v4i32_v4i16(<4 x i32> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: retq
@@ -1290,13 +1227,13 @@ define <4 x i16> @trunc_usat_v4i32_v4i16(<4 x i32> %a0) {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: psrld $16, %xmm1
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i32_v4i16:
@@ -1359,13 +1296,12 @@ define void @trunc_usat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,2,2,3,4,5,6,7]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm2
+; SSE2-NEXT: pandn %xmm0, %xmm2
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movq %xmm0, (%rdi)
@@ -1375,14 +1311,13 @@ define void @trunc_usat_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movq %xmm2, (%rdi)
+; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSSE3-NEXT: psrld $16, %xmm1
+; SSSE3-NEXT: por %xmm2, %xmm1
+; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
+; SSSE3-NEXT: movq %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i32_v4i16_store:
@@ -1712,12 +1647,11 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
@@ -1730,12 +1664,11 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm1, %xmm0
@@ -1747,40 +1680,24 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: trunc_usat_v2i64_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v2i64_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX2-NEXT: # xmm1 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_usat_v2i64_v2i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i8:
; AVX512F: # %bb.0:
@@ -1821,12 +1738,11 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: por %xmm0, %xmm1
@@ -1841,12 +1757,11 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
+; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; SSSE3-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
+; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
+; SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm0, %xmm1
@@ -1860,42 +1775,25 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903,2147483903,2147483903]
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pand %xmm4, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT: pxor %xmm0, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
+; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE41-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT: pandn %xmm3, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pextrw $0, %xmm2, (%rdi)
; SSE41-NEXT: retq
;
-; AVX1-LABEL: trunc_usat_v2i64_v2i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: # xmm3 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v2i64_v2i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [255,255]
-; AVX2-NEXT: # xmm1 = mem[0,0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: trunc_usat_v2i64_v2i8_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
+; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store:
; AVX512F: # %bb.0:
@@ -2022,9 +1920,9 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,255,255]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062]
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2159,9 +2057,9 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,255,255]
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062]
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2906,13 +2804,14 @@ define <4 x i8> @trunc_usat_v4i32_v4i8(<4 x i32> %a0) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
+; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: psrld $24, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i32_v4i8:
@@ -2983,14 +2882,14 @@ define void @trunc_usat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; SSE2-SSSE3: # %bb.0:
; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm2
-; SSE2-SSSE3-NEXT: packuswb %xmm2, %xmm2
-; SSE2-SSSE3-NEXT: movd %xmm2, (%rdi)
+; SSE2-SSSE3-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
+; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm2
+; SSE2-SSSE3-NEXT: psrld $24, %xmm1
+; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
+; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
; SSE2-SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_usat_v4i32_v4i8_store:
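For reference, the constant and operand changes in the checks above (for example, the AVX2 bound 9223372036854841343 becoming 9223372036854841342 while the PCMPGT operands swap) come down to a simple signed-compare identity: for any per-lane constant C other than the minimum signed value, NOT(C > x) is the same predicate as (x > C - 1). A minimal standalone C++ sketch (not part of the commit; the file name is illustrative) that exhaustively checks the identity over a small signed type standing in for each vector lane:

// check_pcmpgt_commute.cpp -- hypothetical sanity check, not in the LLVM tree.
// Verifies that inverting a signed greater-than against a constant is the
// same as comparing against the constant minus one, provided the constant
// is not the minimum signed value (where C - 1 would wrap).
#include <cassert>
#include <cstdint>

int main() {
  // Start one above INT8_MIN: the identity does not hold for the minimum
  // signed value because C - 1 wraps around to the maximum.
  for (int c = INT8_MIN + 1; c <= INT8_MAX; ++c)
    for (int x = INT8_MIN; x <= INT8_MAX; ++x)
      assert(!(c > x) == (x > c - 1));   // NOT(C > x)  ==  (x > C - 1)
  return 0;
}

The same reasoning carries over to the 64-bit lanes in these tests: PCMPGTQ treats 0x800000000000FFFF (printed as 9223372036854841343) as a signed value, so decrementing it gives 9223372036854841342 and the compare direction flips, which is exactly what the updated CHECK lines show.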