[llvm] r324842 - [X86] Use min/max for vector ult/ugt compares if it avoids a sign flip.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Feb 11 09:11:40 PST 2018
Author: ctopper
Date: Sun Feb 11 09:11:40 2018
New Revision: 324842
URL: http://llvm.org/viewvc/llvm-project?rev=324842&view=rev
Log:
[X86] Use min/max for vector ult/ugt compares if it avoids a sign flip.
Summary:
Currently we only use min/max to help with ule/uge compares, because doing so removes an invert of the result that would otherwise be needed. But we can also use it for ult/ugt compares when it avoids the sign-bit flip needed to use pcmpgt, at the cost of requiring an invert after the compare.
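To make the tradeoff concrete, here is a minimal sketch of the two lowerings of x u< y for a v16i8 compare (hypothetical registers, AVX forms for readability; SIGN_MASK stands for a vector of 0x80 bytes). The identity x u< y == !(x == umax(x, y)) is what the new path uses:

Old (flip sign bits, then signed compare):
  vpxor    SIGN_MASK, %xmm0, %xmm2   ; x ^ 0x80 per byte
  vpxor    SIGN_MASK, %xmm1, %xmm3   ; y ^ 0x80 per byte
  vpcmpgtb %xmm2, %xmm3, %xmm0       ; (y^0x80) s> (x^0x80)  ==  x u< y

New (max, compare-equal, invert):
  vpmaxub  %xmm1, %xmm0, %xmm2       ; umax(x, y)
  vpcmpeqb %xmm2, %xmm0, %xmm0       ; x == umax(x, y)  ==  x u>= y
  ; ...logical-not of the result...  ; !(x u>= y)       ==  x u< y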
I also refactored the code so that the min/max handling is self-contained and does its own return, instead of setting a flag that alters the rest of the function's behavior.
Most of the test cases look OK with this. I did notice that we add instructions when one of the operands being sign-flipped is a constant vector, because previously we were able to constant-fold the flip into it.
I also noticed that the SSE min/max sometimes clobbers a register that is still needed after the compare, which results in an extra move being inserted before the min/max to preserve that register. We could try to detect this and switch from min to max, changing the compare operands so that the operand reused by the compare is the one that survives.
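As an illustration of that last point, a minimal sketch with hypothetical registers (the two-operand SSE forms overwrite their destination, which is also a source):

  movdqa  %xmm0, %xmm2        ; extra copy to preserve x, still needed below
  pminub  %xmm1, %xmm2        ; xmm2 = umin(x, y), clobbering the copy
  pcmpeqb %xmm2, %xmm0        ; x == umin(x, y)  ==  x u<= y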
Reviewers: spatel, RKSimon
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D42935
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
llvm/trunk/test/CodeGen/X86/psubus.ll
llvm/trunk/test/CodeGen/X86/vec_cmp_uint-128.ll
llvm/trunk/test/CodeGen/X86/vec_minmax_match.ll
llvm/trunk/test/CodeGen/X86/vec_setcc-2.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Feb 11 09:11:40 2018
@@ -18071,16 +18071,6 @@ static SDValue LowerVSETCC(SDValue Op, c
}
}
- // We are handling one of the integer comparisons here. Since SSE only has
- // GT and EQ comparisons for integer, swapping operands and multiple
- // operations may be required for some comparisons.
- unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
- : X86ISD::PCMPGT;
- bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
- Cond == ISD::SETGE || Cond == ISD::SETUGE;
- bool Invert = Cond == ISD::SETNE ||
- (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
-
// If both operands are known non-negative, then an unsigned compare is the
// same as a signed compare and there's no need to flip signbits.
// TODO: We could check for more general simplifications here since we're
@@ -18088,27 +18078,47 @@ static SDValue LowerVSETCC(SDValue Op, c
bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
- // Special case: Use min/max operations for SETULE/SETUGE
- MVT VET = VT.getVectorElementType();
- bool HasMinMax =
- (Subtarget.hasAVX512() && VET == MVT::i64) ||
- (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
- (Subtarget.hasSSE2() && (VET == MVT::i8));
- bool MinMax = false;
- if (HasMinMax) {
+ // Special case: Use min/max operations for unsigned compares. We only want
+ // to do this for unsigned compares if we need to flip signs or if it allows
+ // us to avoid an invert.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (ISD::isUnsignedIntSetCC(Cond) &&
+ (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
+ TLI.isOperationLegal(ISD::UMIN, VT)) {
+ bool Invert = false;
+ unsigned Opc;
switch (Cond) {
- default: break;
- case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
- case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
+ default: llvm_unreachable("Unexpected condition code");
+ case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETULE: Opc = ISD::UMIN; break;
+ case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
+ case ISD::SETUGE: Opc = ISD::UMAX; break;
}
- if (MinMax)
- Swap = Invert = FlipSigns = false;
+ SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+ Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
+
+ // If the logical-not of the result is required, perform that now.
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, VT);
+
+ return Result;
}
+ // We are handling one of the integer comparisons here. Since SSE only has
+ // GT and EQ comparisons for integer, swapping operands and multiple
+ // operations may be required for some comparisons.
+ unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
+ : X86ISD::PCMPGT;
+ bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
+ Cond == ISD::SETGE || Cond == ISD::SETUGE;
+ bool Invert = Cond == ISD::SETNE ||
+ (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
+
+ MVT VET = VT.getVectorElementType();
bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
bool Subus = false;
- if (!MinMax && HasSubus) {
+ if (HasSubus) {
// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
// Op0 u<= Op1:
// t = psubus Op0, Op1
@@ -18227,9 +18237,6 @@ static SDValue LowerVSETCC(SDValue Op, c
if (Invert)
Result = DAG.getNOT(dl, Result, VT);
- if (MinMax)
- Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
-
if (Subus)
Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
getZeroVector(VT, Subtarget, DAG, dl));
Modified: llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-insert-extract.ll Sun Feb 11 09:11:40 2018
@@ -989,11 +989,11 @@ define zeroext i8 @test_extractelement_v
define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
; KNL-LABEL: test_extractelement_v32i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $2, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1018,12 +1018,12 @@ define zeroext i8 @test_extractelement_v
define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: test_extractelement_v64i1:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2
-; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1054,12 +1054,12 @@ define zeroext i8 @test_extractelement_v
define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
; KNL-LABEL: extractelement_v64i1_alt:
; KNL: ## %bb.0:
-; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm0, %ymm3, %ymm2
-; KNL-NEXT: vpxor %ymm0, %ymm1, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm0
+; KNL-NEXT: vpcmpeqb %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpslld $31, %zmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
; KNL-NEXT: kshiftrw $15, %k0, %k0
; KNL-NEXT: kmovw %k0, %eax
@@ -1645,10 +1645,9 @@ define zeroext i8 @test_extractelement_v
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $edi killed $edi def $rdi
-; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
; KNL-NEXT: andl $31, %edi
; KNL-NEXT: movzbl (%rsp,%rdi), %eax
@@ -1707,9 +1706,10 @@ define i32 @test_insertelement_variable_
; KNL-NEXT: andq $-32, %rsp
; KNL-NEXT: subq $64, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
-; KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT: vpminub %ymm1, %ymm0, %ymm1
+; KNL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: andl $31, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm0, (%rsp)
@@ -1771,11 +1771,13 @@ define i64 @test_insertelement_variable_
; KNL-NEXT: andq $-64, %rsp
; KNL-NEXT: subq $128, %rsp
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
-; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT: vpminub %ymm2, %ymm0, %ymm3
+; KNL-NEXT: vpcmpeqb %ymm3, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vpminub %ymm2, %ymm1, %ymm2
+; KNL-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
; KNL-NEXT: andl $63, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%rsp)
@@ -1952,14 +1954,16 @@ define i96 @test_insertelement_variable_
; KNL-NEXT: vpinsrb $14, 208(%rbp), %xmm3, %xmm3
; KNL-NEXT: vpinsrb $15, 216(%rbp), %xmm3, %xmm3
; KNL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm3, %ymm2, %ymm2
-; KNL-NEXT: vpcmpgtb %ymm3, %ymm2, %ymm2
-; KNL-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtb %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpxor %ymm3, %ymm0, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm3, %ymm0, %ymm0
; KNL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; KNL-NEXT: vpminub %ymm3, %ymm2, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpminub %ymm3, %ymm1, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
+; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpminub %ymm3, %ymm0, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; KNL-NEXT: movl 744(%rbp), %eax
; KNL-NEXT: andl $127, %eax
; KNL-NEXT: cmpb $0, 736(%rbp)
@@ -2173,15 +2177,19 @@ define i128 @test_insertelement_variable
; KNL-NEXT: andq $-128, %rsp
; KNL-NEXT: subq $256, %rsp ## imm = 0x100
; KNL-NEXT: ## kill: def $esi killed $esi def $rsi
-; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; KNL-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; KNL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0
-; KNL-NEXT: vpxor %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vpcmpgtb %ymm4, %ymm1, %ymm1
-; KNL-NEXT: vpxor %ymm4, %ymm2, %ymm2
-; KNL-NEXT: vpcmpgtb %ymm4, %ymm2, %ymm2
-; KNL-NEXT: vpxor %ymm4, %ymm3, %ymm3
-; KNL-NEXT: vpcmpgtb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; KNL-NEXT: vpminub %ymm4, %ymm0, %ymm5
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0
+; KNL-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; KNL-NEXT: vpminub %ymm4, %ymm1, %ymm5
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm1
+; KNL-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; KNL-NEXT: vpminub %ymm4, %ymm2, %ymm5
+; KNL-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm2
+; KNL-NEXT: vpternlogq $15, %zmm2, %zmm2, %zmm2
+; KNL-NEXT: vpminub %ymm4, %ymm3, %ymm4
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
; KNL-NEXT: andl $127, %esi
; KNL-NEXT: testb %dil, %dil
; KNL-NEXT: vmovdqa %ymm3, {{[0-9]+}}(%rsp)
Modified: llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll Sun Feb 11 09:11:40 2018
@@ -16294,11 +16294,11 @@ define zeroext i32 @test_vpcmpultb_v16i1
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper
@@ -16321,11 +16321,11 @@ define zeroext i32 @test_vpcmpultb_v16i1
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v32i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub (%rdi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper
@@ -16350,11 +16350,11 @@ define zeroext i32 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -16381,11 +16381,11 @@ define zeroext i32 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v32i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub (%rsi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -16413,11 +16413,11 @@ define zeroext i64 @test_vpcmpultb_v16i1
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
@@ -16441,11 +16441,11 @@ define zeroext i64 @test_vpcmpultb_v16i1
;
; NoVLX-LABEL: test_vpcmpultb_v16i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub (%rdi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
@@ -16471,11 +16471,11 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -16502,11 +16502,11 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultb_v16i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxub (%rsi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -16535,15 +16535,16 @@ define zeroext i64 @test_vpcmpultb_v32i1
;
; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
@@ -16569,15 +16570,16 @@ define zeroext i64 @test_vpcmpultb_v32i1
;
; NoVLX-LABEL: test_vpcmpultb_v32i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxub (%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
@@ -16605,17 +16607,18 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: andl %edi, %ecx
@@ -16646,17 +16649,18 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultb_v32i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxub (%rsi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm0
; NoVLX-NEXT: vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: andl %edi, %ecx
@@ -16688,11 +16692,11 @@ define zeroext i16 @test_vpcmpultw_v8i1_
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -16717,11 +16721,11 @@ define zeroext i16 @test_vpcmpultw_v8i1_
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: # kill: def $ax killed $ax killed $eax
@@ -16748,11 +16752,11 @@ define zeroext i16 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -16781,11 +16785,11 @@ define zeroext i16 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v16i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -16814,11 +16818,11 @@ define zeroext i32 @test_vpcmpultw_v8i1_
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper
@@ -16841,11 +16845,11 @@ define zeroext i32 @test_vpcmpultw_v8i1_
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v32i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper
@@ -16870,11 +16874,11 @@ define zeroext i32 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -16901,11 +16905,11 @@ define zeroext i32 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v32i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -16933,11 +16937,11 @@ define zeroext i64 @test_vpcmpultw_v8i1_
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
@@ -16961,11 +16965,11 @@ define zeroext i64 @test_vpcmpultw_v8i1_
;
; NoVLX-LABEL: test_vpcmpultw_v8i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rdi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw (%rdi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
@@ -16991,11 +16995,11 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -17023,11 +17027,11 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v8i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; NoVLX-NEXT: vpxor (%rsi), %xmm1, %xmm1
-; NoVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
+; NoVLX-NEXT: vpmaxuw (%rsi), %xmm0, %xmm1
+; NoVLX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwq %xmm0, %zmm0
+; NoVLX-NEXT: vpsllq $63, %zmm0, %zmm0
; NoVLX-NEXT: kmovw %edi, %k1
; NoVLX-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; NoVLX-NEXT: kmovw %k0, %eax
@@ -17057,11 +17061,11 @@ define zeroext i32 @test_vpcmpultw_v16i1
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper
@@ -17085,11 +17089,11 @@ define zeroext i32 @test_vpcmpultw_v16i1
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v32i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: vzeroupper
@@ -17115,11 +17119,11 @@ define zeroext i32 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -17147,11 +17151,11 @@ define zeroext i32 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v32i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -17180,11 +17184,11 @@ define zeroext i64 @test_vpcmpultw_v16i1
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
@@ -17209,11 +17213,11 @@ define zeroext i64 @test_vpcmpultw_v16i1
;
; NoVLX-LABEL: test_vpcmpultw_v16i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: movzwl %ax, %eax
@@ -17240,11 +17244,11 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -17272,11 +17276,11 @@ define zeroext i64 @test_masked_vpcmpult
;
; NoVLX-LABEL: test_masked_vpcmpultw_v16i1_v64i1_mask_mem:
; NoVLX: # %bb.0: # %entry
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
@@ -17475,17 +17479,18 @@ define zeroext i64 @test_vpcmpultw_v32i1
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
; NoVLX-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm4 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor %ymm4, %ymm2, %ymm2
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm2, %ymm0, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpxor %ymm4, %ymm1, %ymm0
-; NoVLX-NEXT: vpxor %ymm4, %ymm3, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm3, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
@@ -17592,21 +17597,22 @@ define zeroext i64 @test_vpcmpultw_v32i1
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; NoVLX-NEXT: shrq $48, %rax
; NoVLX-NEXT: vpinsrw $7, %eax, %xmm0, %xmm0
; NoVLX-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rdi), %ymm2, %ymm3
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm3, %ymm0
+; NoVLX-NEXT: vpmaxuw (%rdi), %ymm0, %ymm2
+; NoVLX-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm0
-; NoVLX-NEXT: vpxor 32(%rdi), %ymm2, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw 32(%rdi), %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: shll $16, %eax
@@ -17641,17 +17647,17 @@ define zeroext i64 @test_masked_vpcmpult
; NoVLX-NEXT: movq %rax, %rdx
; NoVLX-NEXT: vmovd %eax, %xmm4
; NoVLX-NEXT: shrl $16, %eax
-; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm5
+; NoVLX-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
; NoVLX-NEXT: vmovq %xmm9, %rax
-; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm4
+; NoVLX-NEXT: vextracti32x4 $2, %zmm1, %xmm5
; NoVLX-NEXT: vextracti32x4 $3, %zmm1, %xmm7
; NoVLX-NEXT: vextracti128 $1, %ymm0, %xmm6
; NoVLX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm3, %rdx
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm3
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm3
; NoVLX-NEXT: movl %edx, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %edx, %xmm3, %xmm3
@@ -17664,14 +17670,14 @@ define zeroext i64 @test_masked_vpcmpult
; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm10
; NoVLX-NEXT: movl %ecx, %edx
; NoVLX-NEXT: shrl $16, %edx
-; NoVLX-NEXT: vmovd %ecx, %xmm5
-; NoVLX-NEXT: vpinsrw $1, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vmovd %ecx, %xmm4
+; NoVLX-NEXT: vpinsrw $1, %edx, %xmm4, %xmm4
; NoVLX-NEXT: movq %rcx, %rdx
; NoVLX-NEXT: shrq $32, %rdx
-; NoVLX-NEXT: vpinsrw $2, %edx, %xmm5, %xmm5
+; NoVLX-NEXT: vpinsrw $2, %edx, %xmm4, %xmm4
; NoVLX-NEXT: vpextrq $1, %xmm2, %rdx
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm5, %xmm2
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm4, %xmm2
; NoVLX-NEXT: movl %edx, %ecx
; NoVLX-NEXT: shrl $16, %ecx
; NoVLX-NEXT: vpinsrw $4, %edx, %xmm2, %xmm2
@@ -17681,7 +17687,7 @@ define zeroext i64 @test_masked_vpcmpult
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: vmovq %xmm6, %rcx
; NoVLX-NEXT: shrq $48, %rdx
-; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm5
+; NoVLX-NEXT: vpinsrw $7, %edx, %xmm2, %xmm4
; NoVLX-NEXT: movl %ecx, %edx
; NoVLX-NEXT: shrl $16, %edx
; NoVLX-NEXT: vmovd %ecx, %xmm2
@@ -17739,7 +17745,7 @@ define zeroext i64 @test_masked_vpcmpult
; NoVLX-NEXT: movq %rdx, %rcx
; NoVLX-NEXT: shrq $32, %rcx
; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm0, %xmm0
-; NoVLX-NEXT: vmovq %xmm4, %rcx
+; NoVLX-NEXT: vmovq %xmm5, %rcx
; NoVLX-NEXT: shrq $48, %rdx
; NoVLX-NEXT: vpinsrw $7, %edx, %xmm0, %xmm0
; NoVLX-NEXT: movl %ecx, %edx
@@ -17788,35 +17794,36 @@ define zeroext i64 @test_masked_vpcmpult
; NoVLX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
; NoVLX-NEXT: shrq $48, %rdx
; NoVLX-NEXT: vpinsrw $7, %edx, %xmm1, %xmm1
-; NoVLX-NEXT: vpextrq $1, %xmm4, %rax
+; NoVLX-NEXT: vpextrq $1, %xmm5, %rax
; NoVLX-NEXT: vinserti128 $1, %xmm6, %ymm8, %ymm3
; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm3
-; NoVLX-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm3, %ymm1, %ymm1
-; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
-; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
+; NoVLX-NEXT: vpmaxuw %ymm1, %ymm3, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm3, %ymm1
; NoVLX-NEXT: shrq $48, %rcx
-; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm1
+; NoVLX-NEXT: vpinsrw $3, %ecx, %xmm7, %xmm2
; NoVLX-NEXT: movl %eax, %ecx
; NoVLX-NEXT: shrl $16, %ecx
-; NoVLX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $4, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vpinsrw $5, %ecx, %xmm2, %xmm2
; NoVLX-NEXT: movq %rax, %rcx
; NoVLX-NEXT: shrq $32, %rcx
-; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm1, %xmm1
+; NoVLX-NEXT: vpinsrw $6, %ecx, %xmm2, %xmm2
+; NoVLX-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1
+; NoVLX-NEXT: vpmovsxwd %ymm1, %zmm1
+; NoVLX-NEXT: vpslld $31, %zmm1, %zmm1
+; NoVLX-NEXT: vptestmd %zmm1, %zmm1, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: andl %edi, %ecx
; NoVLX-NEXT: shrl $16, %edi
; NoVLX-NEXT: shrq $48, %rax
-; NoVLX-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm3
-; NoVLX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1
-; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; NoVLX-NEXT: vpxor %ymm2, %ymm3, %ymm1
-; NoVLX-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; NoVLX-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm1
+; NoVLX-NEXT: vpinsrw $7, %eax, %xmm2, %xmm2
+; NoVLX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
+; NoVLX-NEXT: vpmaxuw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpcmpeqw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %edx
; NoVLX-NEXT: andl %edi, %edx
@@ -17929,22 +17936,23 @@ define zeroext i64 @test_masked_vpcmpult
; NoVLX-NEXT: shrq $48, %rcx
; NoVLX-NEXT: vpinsrw $7, %ecx, %xmm0, %xmm0
; NoVLX-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; NoVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768]
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor (%rsi), %ymm1, %ymm4
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm4, %ymm0
+; NoVLX-NEXT: vpmaxuw (%rsi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; NoVLX-NEXT: shrq $48, %rdx
-; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm3
+; NoVLX-NEXT: vpinsrw $7, %edx, %xmm3, %xmm1
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %eax
; NoVLX-NEXT: andl %edi, %eax
; NoVLX-NEXT: shrl $16, %edi
-; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm0
-; NoVLX-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; NoVLX-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
-; NoVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0
+; NoVLX-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm0
+; NoVLX-NEXT: vpmaxuw 32(%rsi), %ymm0, %ymm1
+; NoVLX-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; NoVLX-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; NoVLX-NEXT: vpmovsxwd %ymm0, %zmm0
+; NoVLX-NEXT: vpslld $31, %zmm0, %zmm0
; NoVLX-NEXT: vptestmd %zmm0, %zmm0, %k0
; NoVLX-NEXT: kmovw %k0, %ecx
; NoVLX-NEXT: andl %edi, %ecx
Modified: llvm/trunk/test/CodeGen/X86/psubus.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/psubus.ll?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/psubus.ll (original)
+++ llvm/trunk/test/CodeGen/X86/psubus.ll Sun Feb 11 09:11:40 2018
@@ -214,12 +214,14 @@ define <16 x i16> @test8(<16 x i16> %x)
; AVX1-LABEL: test8:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
-; AVX1-NEXT: vpcmpgtw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtw %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32766,32766,32766,32766,32766,32766,32766,32766]
+; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
; AVX1-NEXT: vpaddw %xmm3, %xmm1, %xmm1
@@ -338,12 +340,14 @@ define <32 x i8> @test11(<32 x i8> %x) n
; AVX1-LABEL: test11:
; AVX1: # %bb.0: # %vector.ph
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
-; AVX1-NEXT: vpcmpgtb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [126,126,126,126,126,126,126,126,126,126,126,126,126,126,126,126]
+; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
@@ -496,22 +500,23 @@ define <8 x i16> @test13(<8 x i16> %x, <
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: psubd %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pmaxud %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm5, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE41-NEXT: packssdw %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm6, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm7
+; SSE41-NEXT: pmaxud %xmm2, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufb %xmm6, %xmm7
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
; SSE41-NEXT: psubd %xmm2, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm4
-; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm4
+; SSE41-NEXT: pshufb %xmm6, %xmm4
+; SSE41-NEXT: pshufb %xmm6, %xmm3
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE41-NEXT: pandn %xmm4, %xmm0
; SSE41-NEXT: retq
@@ -521,15 +526,15 @@ define <8 x i16> @test13(<8 x i16> %x, <
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaxud %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -543,10 +548,10 @@ define <8 x i16> @test13(<8 x i16> %x, <
; AVX2-LABEL: test13:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
@@ -682,44 +687,43 @@ define <16 x i8> @test14(<16 x i8> %x, <
;
; SSE41-LABEL: test14:
; SSE41: # %bb.0: # %vector.ph
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,0,1]
+; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm9 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: psubd %xmm6, %xmm4
-; SSE41-NEXT: por %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm10, %xmm6
-; SSE41-NEXT: movdqa %xmm3, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: psubd %xmm9, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm9
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm9
-; SSE41-NEXT: pshufb %xmm10, %xmm9
-; SSE41-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: psubd %xmm0, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm6, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm10 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: pmaxud %xmm10, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm6, %xmm6
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm7 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm7, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm5
+; SSE41-NEXT: pmaxud %xmm9, %xmm5
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
+; SSE41-NEXT: pxor %xmm6, %xmm5
+; SSE41-NEXT: pshufb %xmm7, %xmm5
+; SSE41-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; SSE41-NEXT: movdqa %xmm1, %xmm0
+; SSE41-NEXT: pmaxud %xmm8, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pxor %xmm6, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE41-NEXT: pshufb %xmm12, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: por %xmm8, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm5
-; SSE41-NEXT: pshufb %xmm6, %xmm5
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4,5,6,7]
-; SSE41-NEXT: psubd %xmm8, %xmm2
+; SSE41-NEXT: pmaxud %xmm11, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE41-NEXT: pxor %xmm6, %xmm7
+; SSE41-NEXT: pshufb %xmm12, %xmm7
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4,5,6,7]
+; SSE41-NEXT: psubd %xmm11, %xmm2
+; SSE41-NEXT: psubd %xmm8, %xmm1
+; SSE41-NEXT: psubd %xmm9, %xmm3
+; SSE41-NEXT: psubd %xmm10, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pand %xmm5, %xmm3
@@ -737,31 +741,31 @@ define <16 x i8> @test14(<16 x i8> %x, <
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm9 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm11 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpxor %xmm6, %xmm7, %xmm3
-; AVX1-NEXT: vpor %xmm6, %xmm0, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm4
-; AVX1-NEXT: vpor %xmm6, %xmm10, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm11
-; AVX1-NEXT: vpxor %xmm6, %xmm1, %xmm4
-; AVX1-NEXT: vpor %xmm6, %xmm9, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm3
-; AVX1-NEXT: vpor %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpacksswb %xmm11, %xmm3, %xmm3
-; AVX1-NEXT: vpsubd %xmm8, %xmm5, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
+; AVX1-NEXT: vpmaxud %xmm0, %xmm6, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
+; AVX1-NEXT: vpmaxud %xmm11, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm4
+; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm10
+; AVX1-NEXT: vpmaxud %xmm9, %xmm1, %xmm7
+; AVX1-NEXT: vpcmpeqd %xmm7, %xmm1, %xmm7
+; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpmaxud %xmm8, %xmm4, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm7, %xmm3
+; AVX1-NEXT: vpacksswb %xmm10, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm8, %xmm4, %xmm4
; AVX1-NEXT: vpsubd %xmm9, %xmm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm10, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpsubd %xmm11, %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2
@@ -779,18 +783,18 @@ define <16 x i8> @test14(<16 x i8> %x, <
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6
-; AVX2-NEXT: vpcmpgtd %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm6
-; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4
-; AVX2-NEXT: vpcmpgtd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm4, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpeqd %ymm5, %ymm5, %ymm5
+; AVX2-NEXT: vpxor %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpmaxud %ymm3, %ymm2, %ymm6
+; AVX2-NEXT: vpcmpeqd %ymm6, %ymm2, %ymm6
+; AVX2-NEXT: vpxor %ymm5, %ymm6, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX2-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
@@ -881,23 +885,25 @@ define <8 x i16> @test15(<8 x i16> %x, <
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: por %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: packssdw %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm0, %xmm4
+; SSE41-NEXT: pminud %xmm1, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm3, %xmm7
+; SSE41-NEXT: pminud %xmm2, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufb %xmm6, %xmm7
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE41-NEXT: psubd %xmm2, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm0
-; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm6, %xmm0
+; SSE41-NEXT: pshufb %xmm6, %xmm3
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test15:
@@ -905,15 +911,15 @@ define <8 x i16> @test15(<8 x i16> %x, <
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpminud %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm2, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -927,10 +933,10 @@ define <8 x i16> @test15(<8 x i16> %x, <
; AVX2-LABEL: test15:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
@@ -1015,23 +1021,25 @@ define <8 x i16> @test16(<8 x i16> %x, <
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psubd %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: por %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: por %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE41-NEXT: packssdw %xmm4, %xmm5
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: pmaxud %xmm0, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm4
+; SSE41-NEXT: pcmpeqd %xmm5, %xmm5
+; SSE41-NEXT: pxor %xmm5, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm6, %xmm4
+; SSE41-NEXT: movdqa %xmm2, %xmm7
+; SSE41-NEXT: pmaxud %xmm3, %xmm7
+; SSE41-NEXT: pcmpeqd %xmm2, %xmm7
+; SSE41-NEXT: pxor %xmm5, %xmm7
+; SSE41-NEXT: pshufb %xmm6, %xmm7
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
; SSE41-NEXT: psubd %xmm2, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm1, %xmm0
-; SSE41-NEXT: pshufb %xmm1, %xmm3
+; SSE41-NEXT: psubd %xmm1, %xmm0
+; SSE41-NEXT: pshufb %xmm6, %xmm0
+; SSE41-NEXT: pshufb %xmm6, %xmm3
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE41-NEXT: pand %xmm5, %xmm0
+; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: test16:
@@ -1039,15 +1047,15 @@ define <8 x i16> @test16(<8 x i16> %x, <
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpmaxud %xmm0, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaxud %xmm2, %xmm5, %xmm6
+; AVX1-NEXT: vpcmpeqd %xmm6, %xmm5, %xmm6
+; AVX1-NEXT: vpxor %xmm4, %xmm6, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpsubd %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
@@ -1061,10 +1069,10 @@ define <8 x i16> @test16(<8 x i16> %x, <
; AVX2-LABEL: test16:
; AVX2: # %bb.0: # %vector.ph
; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpmaxud %ymm0, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
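The rewritten check sequences above all rest on one identity, so a minimal
scalar sketch may help when reading the diffs. This is illustrative code, not
part of the patch, and the names are mine:

#include <cassert>
#include <cstdint>

int main() {
  // The equivalences the min/max lowering relies on:
  //   a >u b  <=>  a != umin(a, b)   (min, compare-eq, invert)
  //   a <u b  <=>  a != umax(a, b)   (max, compare-eq, invert)
  for (unsigned a = 0; a < 256; ++a) {
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t mn = a < b ? a : b;
      uint8_t mx = a > b ? a : b;
      assert((a > b) == (a != mn));
      assert((a < b) == (a != mx));
    }
  }
}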
Modified: llvm/trunk/test/CodeGen/X86/vec_cmp_uint-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_cmp_uint-128.ll?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cmp_uint-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_cmp_uint-128.ll Sun Feb 11 09:11:40 2018
@@ -506,46 +506,81 @@ define <2 x i64> @gt_v2i64(<2 x i64> %a,
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: gt_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: gt_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: gt_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: gt_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: gt_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512-NEXT: vpminuq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ugt <2 x i64> %a, %b
%2 = sext <2 x i1> %1 to <2 x i64>
ret <2 x i64> %2
}
define <4 x i32> @gt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
-; SSE-LABEL: gt_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: gt_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: gt_v4i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pminud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE42-LABEL: gt_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pminud %xmm0, %xmm1
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm0
+; SSE42-NEXT: retq
;
; AVX1-LABEL: gt_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: gt_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOP-LABEL: gt_v4i32:
@@ -555,10 +590,11 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a,
;
; AVX512-LABEL: gt_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp ugt <4 x i32> %a, %b
%2 = sext <4 x i1> %1 to <4 x i32>
@@ -566,26 +602,59 @@ define <4 x i32> @gt_v4i32(<4 x i32> %a,
}
define <8 x i16> @gt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
-; SSE-LABEL: gt_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: gt_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm1
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pcmpgtw %xmm1, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: gt_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE41-LABEL: gt_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pminuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE42-LABEL: gt_v8i16:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pminuw %xmm0, %xmm1
+; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: gt_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: gt_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: gt_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtuw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: gt_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ugt <8 x i16> %a, %b
%2 = sext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
@@ -594,24 +663,41 @@ define <8 x i16> @gt_v8i16(<8 x i16> %a,
define <16 x i8> @gt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: gt_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pcmpgtb %xmm1, %xmm0
+; SSE-NEXT: pminub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: gt_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: gt_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: gt_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: gt_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpcomgtub %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: gt_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ugt <16 x i8> %a, %b
%2 = sext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
@@ -843,47 +929,82 @@ define <2 x i64> @lt_v2i64(<2 x i64> %a,
; SSE42-NEXT: movdqa %xmm2, %xmm0
; SSE42-NEXT: retq
;
-; AVX-LABEL: lt_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: lt_v2i64:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: lt_v2i64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: lt_v2i64:
; XOP: # %bb.0:
; XOP-NEXT: vpcomltuq %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: lt_v2i64:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
+; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ult <2 x i64> %a, %b
%2 = sext <2 x i1> %1 to <2 x i64>
ret <2 x i64> %2
}
define <4 x i32> @lt_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
-; SSE-LABEL: lt_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: lt_v4i32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: lt_v4i32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmaxud %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE42-LABEL: lt_v4i32:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmaxud %xmm0, %xmm1
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm0
+; SSE42-NEXT: retq
;
; AVX1-LABEL: lt_v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: lt_v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; XOP-LABEL: lt_v4i32:
@@ -893,10 +1014,11 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a,
;
; AVX512-LABEL: lt_v4i32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = icmp ult <4 x i32> %a, %b
%2 = sext <4 x i1> %1 to <4 x i32>
@@ -904,27 +1026,60 @@ define <4 x i32> @lt_v4i32(<4 x i32> %a,
}
define <8 x i16> @lt_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
-; SSE-LABEL: lt_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm2
-; SSE-NEXT: pcmpgtw %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: lt_v8i16:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
;
-; AVX-LABEL: lt_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; SSE41-LABEL: lt_v8i16:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: retq
+;
+; SSE42-LABEL: lt_v8i16:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pmaxuw %xmm0, %xmm1
+; SSE42-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE42-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE42-NEXT: pxor %xmm1, %xmm0
+; SSE42-NEXT: retq
+;
+; AVX1-LABEL: lt_v8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: lt_v8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: lt_v8i16:
; XOP: # %bb.0:
; XOP-NEXT: vpcomltuw %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: lt_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ult <8 x i16> %a, %b
%2 = sext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
@@ -933,25 +1088,41 @@ define <8 x i16> @lt_v8i16(<8 x i16> %a,
define <16 x i8> @lt_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE-LABEL: lt_v16i8:
; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm2
-; SSE-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm0
+; SSE-NEXT: pmaxub %xmm0, %xmm1
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE-NEXT: pxor %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: lt_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: lt_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: lt_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: retq
;
; XOP-LABEL: lt_v16i8:
; XOP: # %bb.0:
; XOP-NEXT: vpcomltub %xmm1, %xmm0, %xmm0
; XOP-NEXT: retq
+;
+; AVX512-LABEL: lt_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm1
+; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%1 = icmp ult <16 x i8> %a, %b
%2 = sext <16 x i1> %1 to <16 x i8>
ret <16 x i8> %2
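To see the before/after for the v4i32 ugt case side by side, here is a hedged
sketch using SSE intrinsics (illustrative only; the function names are mine,
not from the patch). Both return the same all-ones/all-zeros lane mask. On the
AVX512 paths above the final invert is instead a single vpternlogq $15, whose
immediate encodes NOT of the first source.

#include <immintrin.h>

// Old lowering: flip sign bits so a signed compare acts as unsigned.
__m128i ugt_signflip(__m128i a, __m128i b) {
  __m128i k = _mm_set1_epi32((int)0x80000000);   // the [2147483648,...] splat
  return _mm_cmpgt_epi32(_mm_xor_si128(a, k),    // two pxor
                         _mm_xor_si128(b, k));   // one pcmpgtd
}

// New lowering: a >u b  <=>  !(a == umin(a, b)).
__m128i ugt_minmax(__m128i a, __m128i b) {
  __m128i mn = _mm_min_epu32(a, b);              // pminud (SSE4.1)
  __m128i eq = _mm_cmpeq_epi32(a, mn);           // true iff a <=u b
  return _mm_xor_si128(eq, _mm_set1_epi32(-1));  // pcmpeqd+pxor invert
}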
Modified: llvm/trunk/test/CodeGen/X86/vec_minmax_match.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_minmax_match.ll?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_minmax_match.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_minmax_match.ll Sun Feb 11 09:11:40 2018
@@ -223,11 +223,12 @@ define <4 x i32> @wrong_pred_for_smin_wi
; CHECK-LABEL: wrong_pred_for_smin_with_not:
; CHECK: # %bb.0:
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm1
-; CHECK-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT: vpcmpgtd {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4294967291,4294967291,4294967291,4294967291]
-; CHECK-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vmovaps {{.*#+}} xmm1 = [4294967291,4294967291,4294967291,4294967291]
+; CHECK-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; CHECK-NEXT: retq
%not_x = xor <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
%cmp = icmp ugt <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
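The wrong_pred_for_smin_with_not diff is a concrete instance of the trade-off:
the old sequence folded the sign-flipped constant straight into the vpcmpgtd
memory operand, while the new one compares against vpminud with the plain
constant and pays an extra vpxor to invert. A hedged sketch of the new shape
(illustrative only, names are mine):

#include <immintrin.h>

// x >u 4 per 32-bit lane, min/max style: !(x == umin(x, 4)).
__m128i ugt_four(__m128i x) {
  __m128i mn = _mm_min_epu32(x, _mm_set1_epi32(4));  // vpminud with constant
  __m128i eq = _mm_cmpeq_epi32(x, mn);               // true iff x <=u 4
  return _mm_xor_si128(eq, _mm_set1_epi32(-1));      // invert -> x >u 4
}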
Modified: llvm/trunk/test/CodeGen/X86/vec_setcc-2.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vec_setcc-2.ll?rev=324842&r1=324841&r2=324842&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vec_setcc-2.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vec_setcc-2.ll Sun Feb 11 09:11:40 2018
@@ -5,26 +5,49 @@
; For a setult against a constant, turn it into a setule and lower via psubusw.
define void @loop_no_const_reload(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
-; CHECK-LABEL: loop_no_const_reload:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: testl %edx, %edx
-; CHECK-NEXT: je LBB0_3
-; CHECK-NEXT: ## %bb.1: ## %for.body.preheader
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
-; CHECK-NEXT: pxor %xmm1, %xmm1
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB0_2: ## %for.body
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movdqa (%rdi,%rax), %xmm2
-; CHECK-NEXT: psubusw %xmm0, %xmm2
-; CHECK-NEXT: pcmpeqw %xmm1, %xmm2
-; CHECK-NEXT: movdqa %xmm2, (%rsi,%rax)
-; CHECK-NEXT: addq $16, %rax
-; CHECK-NEXT: decl %edx
-; CHECK-NEXT: jne LBB0_2
-; CHECK-NEXT: LBB0_3: ## %for.end
-; CHECK-NEXT: retq
+; SSE2-LABEL: loop_no_const_reload:
+; SSE2: ## %bb.0: ## %entry
+; SSE2-NEXT: testl %edx, %edx
+; SSE2-NEXT: je LBB0_3
+; SSE2-NEXT: ## %bb.1: ## %for.body.preheader
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [25,25,25,25,25,25,25,25]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: LBB0_2: ## %for.body
+; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqa (%rdi,%rax), %xmm2
+; SSE2-NEXT: psubusw %xmm0, %xmm2
+; SSE2-NEXT: pcmpeqw %xmm1, %xmm2
+; SSE2-NEXT: movdqa %xmm2, (%rsi,%rax)
+; SSE2-NEXT: addq $16, %rax
+; SSE2-NEXT: decl %edx
+; SSE2-NEXT: jne LBB0_2
+; SSE2-NEXT: LBB0_3: ## %for.end
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: loop_no_const_reload:
+; SSE41: ## %bb.0: ## %entry
+; SSE41-NEXT: testl %edx, %edx
+; SSE41-NEXT: je LBB0_3
+; SSE41-NEXT: ## %bb.1: ## %for.body.preheader
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [26,26,26,26,26,26,26,26]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: .p2align 4, 0x90
+; SSE41-NEXT: LBB0_2: ## %for.body
+; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1
+; SSE41-NEXT: movdqa (%rdi,%rax), %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pmaxuw %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqw %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, (%rsi,%rax)
+; SSE41-NEXT: addq $16, %rax
+; SSE41-NEXT: decl %edx
+; SSE41-NEXT: jne LBB0_2
+; SSE41-NEXT: LBB0_3: ## %for.end
+; SSE41-NEXT: retq
entry:
%cmp9 = icmp eq i32 %n, 0
br i1 %cmp9, label %for.end, label %for.body
@@ -51,27 +74,50 @@ for.end:
; Be careful if decrementing the constant would underflow.
define void @loop_const_folding_underflow(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
-; CHECK-LABEL: loop_const_folding_underflow:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: testl %edx, %edx
-; CHECK-NEXT: je LBB1_3
-; CHECK-NEXT: ## %bb.1: ## %for.body.preheader
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [32768,32794,32794,32794,32794,32794,32794,32794]
-; CHECK-NEXT: .p2align 4, 0x90
-; CHECK-NEXT: LBB1_2: ## %for.body
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movdqa (%rdi,%rax), %xmm2
-; CHECK-NEXT: pxor %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm1, %xmm3
-; CHECK-NEXT: pcmpgtw %xmm2, %xmm3
-; CHECK-NEXT: movdqa %xmm3, (%rsi,%rax)
-; CHECK-NEXT: addq $16, %rax
-; CHECK-NEXT: decl %edx
-; CHECK-NEXT: jne LBB1_2
-; CHECK-NEXT: LBB1_3: ## %for.end
-; CHECK-NEXT: retq
+; SSE2-LABEL: loop_const_folding_underflow:
+; SSE2: ## %bb.0: ## %entry
+; SSE2-NEXT: testl %edx, %edx
+; SSE2-NEXT: je LBB1_3
+; SSE2-NEXT: ## %bb.1: ## %for.body.preheader
+; SSE2-NEXT: xorl %eax, %eax
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [32768,32794,32794,32794,32794,32794,32794,32794]
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: LBB1_2: ## %for.body
+; SSE2-NEXT: ## =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqa (%rdi,%rax), %xmm2
+; SSE2-NEXT: pxor %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm1, %xmm3
+; SSE2-NEXT: pcmpgtw %xmm2, %xmm3
+; SSE2-NEXT: movdqa %xmm3, (%rsi,%rax)
+; SSE2-NEXT: addq $16, %rax
+; SSE2-NEXT: decl %edx
+; SSE2-NEXT: jne LBB1_2
+; SSE2-NEXT: LBB1_3: ## %for.end
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: loop_const_folding_underflow:
+; SSE41: ## %bb.0: ## %entry
+; SSE41-NEXT: testl %edx, %edx
+; SSE41-NEXT: je LBB1_3
+; SSE41-NEXT: ## %bb.1: ## %for.body.preheader
+; SSE41-NEXT: xorl %eax, %eax
+; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [0,26,26,26,26,26,26,26]
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: .p2align 4, 0x90
+; SSE41-NEXT: LBB1_2: ## %for.body
+; SSE41-NEXT: ## =>This Inner Loop Header: Depth=1
+; SSE41-NEXT: movdqa (%rdi,%rax), %xmm2
+; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: pmaxuw %xmm0, %xmm3
+; SSE41-NEXT: pcmpeqw %xmm2, %xmm3
+; SSE41-NEXT: pxor %xmm1, %xmm3
+; SSE41-NEXT: movdqa %xmm3, (%rsi,%rax)
+; SSE41-NEXT: addq $16, %rax
+; SSE41-NEXT: decl %edx
+; SSE41-NEXT: jne LBB1_2
+; SSE41-NEXT: LBB1_3: ## %for.end
+; SSE41-NEXT: retq
entry:
%cmp9 = icmp eq i32 %n, 0
br i1 %cmp9, label %for.end, label %for.body
@@ -100,9 +146,11 @@ for.end:
define <16 x i8> @test_ult_byte(<16 x i8> %a) {
; CHECK-LABEL: test_ult_byte:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: psubusb {{.*}}(%rip), %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm1
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11]
+; CHECK-NEXT: pmaxub %xmm0, %xmm1
; CHECK-NEXT: pcmpeqb %xmm1, %xmm0
+; CHECK-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT: pxor %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
%icmp = icmp ult <16 x i8> %a, <i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11>
@@ -114,14 +162,22 @@ entry:
; register operands.
define <8 x i16> @test_ult_register(<8 x i16> %a, <8 x i16> %b) {
-; CHECK-LABEL: test_ult_register:
-; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; CHECK-NEXT: pxor %xmm2, %xmm0
-; CHECK-NEXT: pxor %xmm1, %xmm2
-; CHECK-NEXT: pcmpgtw %xmm0, %xmm2
-; CHECK-NEXT: movdqa %xmm2, %xmm0
-; CHECK-NEXT: retq
+; SSE2-LABEL: test_ult_register:
+; SSE2: ## %bb.0: ## %entry
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; SSE2-NEXT: pxor %xmm2, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: pcmpgtw %xmm0, %xmm2
+; SSE2-NEXT: movdqa %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: test_ult_register:
+; SSE41: ## %bb.0: ## %entry
+; SSE41-NEXT: pmaxuw %xmm0, %xmm1
+; SSE41-NEXT: pcmpeqw %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
+; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: retq
entry:
%icmp = icmp ult <8 x i16> %a, %b
%sext = sext <8 x i1> %icmp to <8 x i16>
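One last sketch for the constant ult cases above (illustrative only, names are
mine): lowering x <u C through pmaxuw compares against C itself, so unlike the
psubusw/setule route there is no C-1 decrement, and a lane constant of 0 (as
in loop_const_folding_underflow) needs no special care.

#include <immintrin.h>

// x <u 26 per 16-bit lane: !(x == umax(x, 26)).
__m128i ult_const26(__m128i x) {
  __m128i mx = _mm_max_epu16(x, _mm_set1_epi16(26));  // pmaxuw (SSE4.1)
  __m128i eq = _mm_cmpeq_epi16(x, mx);                // true iff x >=u 26
  return _mm_xor_si128(eq, _mm_set1_epi16(-1));       // invert -> x <u 26
}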