[llvm] r332904 - [DAGCombine][X86][AArch64] Masked merge unfolding: vector edition.
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Mon May 21 14:41:03 PDT 2018
Author: lebedevri
Date: Mon May 21 14:41:02 2018
New Revision: 332904
URL: http://llvm.org/viewvc/llvm-project?rev=332904&view=rev
Log:
[DAGCombine][X86][AArch64] Masked merge unfolding: vector edition.
Summary:
This **appears** to be the last missing piece for the masked merge pattern handling in the backend.
This is [[ https://bugs.llvm.org/show_bug.cgi?id=37104 | PR37104 ]].
[[ https://bugs.llvm.org/show_bug.cgi?id=6773 | PR6773 ]] will introduce an IR canonicalization that is likely bad for the end assembly.
Previously, `andps`+`andnps` / `bsl` would be generated (see `@out`).
After that canonicalization they would no longer be generated (see `@in`), so we need to make sure that they still are.
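For reference, the two equivalent forms of the pattern, written as illustrative C++ (not code from this patch); `out` takes each bit from `x` where the mask bit is set and from `y` where it is clear:

  // "out" form: and + and-not + or; a single 'bsl' on AArch64,
  // andps+andnps(+orps) or vpcmov on x86.
  unsigned out(unsigned x, unsigned y, unsigned m) {
    return (x & m) | (y & ~m);
  }

  // "in" form: the canonicalized xor+and+xor chain, which hides the
  // select-like structure from the target.
  unsigned in(unsigned x, unsigned y, unsigned m) {
    return ((x ^ y) & m) ^ y;
  }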
Differential Revision: https://reviews.llvm.org/D46528
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
llvm/trunk/test/CodeGen/X86/machine-cp.ll
llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Mon May 21 14:41:02 2018
@@ -5391,10 +5391,6 @@ SDValue DAGCombiner::unfoldMaskedMerge(S
EVT VT = N->getValueType(0);
- // FIXME
- if (VT.isVector())
- return SDValue();
-
// There are 3 commutable operators in the pattern,
// so we have to deal with 8 possible variants of the basic pattern.
SDValue X, Y, M;
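The removed early-out above appears to have been the last scalar-only restriction: unfoldMaskedMerge now also fires for vector types, gated by the per-target hasAndNot() hooks updated below. The rewrite from the `in` form to the `out` form is a per-bit identity, so it is valid for any element width; a self-contained sanity check, purely illustrative:

  #include <cassert>

  int main() {
    // Exhaustive over 8-bit lanes; since the identity holds bit by bit,
    // it extends to every scalar and vector type.
    for (unsigned x = 0; x < 256; ++x)
      for (unsigned y = 0; y < 256; ++y)
        for (unsigned m = 0; m < 256; ++m)
          assert((((x ^ y) & m) ^ y) == ((x & m) | (y & ~m & 0xffu)));
    return 0;
  }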
Modified: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h (original)
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h Mon May 21 14:41:02 2018
@@ -443,9 +443,18 @@ public:
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
- bool hasAndNotCompare(SDValue) const override {
- // 'bics'
- return true;
+ bool hasAndNotCompare(SDValue V) const override {
+ // We can use bics for any scalar.
+ return V.getValueType().isScalarInteger();
+ }
+
+ bool hasAndNot(SDValue Y) const override {
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector())
+ return hasAndNotCompare(Y);
+
+ return VT.getSizeInBits() >= 64; // vector 'bic'
}
bool hasBitPreservingFPLogic(EVT VT) const override {
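With these hooks, AArch64 reports a cheap and-not for any scalar integer ('bics') and for 64/128-bit vectors ('bic'), which is what lets the tests below keep the masked merge as a NEON bit-select. A hedged intrinsics sketch of what that 'bsl' computes (illustrative only, assuming a NEON-capable target; not code from this patch):

  #include <arm_neon.h>

  // vbslq_u32(m, x, y) yields (x & m) | (y & ~m) in a single 'bsl'
  // instruction -- exactly the unfolded masked merge.
  uint32x4_t masked_merge(uint32x4_t x, uint32x4_t y, uint32x4_t m) {
    return vbslq_u32(m, x, y);
  }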
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon May 21 14:41:02 2018
@@ -4751,26 +4751,39 @@ bool X86TargetLowering::isMaskAndCmp0Fol
}
bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
- // A mask and compare against constant is ok for an 'andn' too
- // even though the BMI instruction doesn't have an immediate form.
+ EVT VT = Y.getValueType();
+
+ if (VT.isVector())
+ return false;
if (!Subtarget.hasBMI())
return false;
// There are only 32-bit and 64-bit forms for 'andn'.
- EVT VT = Y.getValueType();
if (VT != MVT::i32 && VT != MVT::i64)
return false;
+ // A mask and compare against constant is ok for an 'andn' too
+ // even though the BMI instruction doesn't have an immediate form.
+
return true;
}
bool X86TargetLowering::hasAndNot(SDValue Y) const {
- // x86 can't form 'andn' with an immediate.
- if (isa<ConstantSDNode>(Y))
+ EVT VT = Y.getValueType();
+
+ if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
+ return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
+
+ // Vector.
+
+ if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
return false;
- return hasAndNotCompare(Y);
+ if (VT == MVT::v4i32)
+ return true;
+
+ return Subtarget.hasSSE2();
}
MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
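On x86 the and-not needed by the unfolded form is available from SSE1 onward as 'andnps'; it is purely bitwise, which is why v4i32 is accepted with bare SSE1 while the remaining 128-bit integer types are gated on SSE2's 'pandn'. A hedged intrinsics sketch of the unfolded sequence (illustrative only, not this patch's lowering):

  #include <xmmintrin.h> // SSE1

  // (x & m) | (~m & y): the andps + andnps + orps triple that the
  // `@out`-style tests check for.
  __m128 masked_merge(__m128 x, __m128 y, __m128 m) {
    return _mm_or_ps(_mm_and_ps(x, m), _mm_andnot_ps(m, y));
  }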
Modified: llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask-const.ll Mon May 21 14:41:02 2018
@@ -77,9 +77,8 @@ define <4 x i32> @in_constant_varx_42(<4
; CHECK-LABEL: in_constant_varx_42:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #42
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
%n1 = and <4 x i32> %n0, %mask
@@ -107,9 +106,8 @@ define <4 x i32> @in_constant_varx_42_in
; CHECK-LABEL: in_constant_varx_42_invmask:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #42
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v2.16b, v1.16b, v0.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%n0 = xor <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42> ; %x
@@ -134,8 +132,8 @@ define <4 x i32> @out_constant_mone_vary
define <4 x i32> @in_constant_mone_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; CHECK-LABEL: in_constant_mone_vary:
; CHECK: // %bb.0:
-; CHECK-NEXT: bic v0.16b, v2.16b, v1.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bic v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: orr v0.16b, v2.16b, v0.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
%n1 = and <4 x i32> %n0, %mask
@@ -161,9 +159,8 @@ define <4 x i32> @out_constant_mone_vary
define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; CHECK-LABEL: in_constant_mone_vary_invmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: mvn v0.16b, v1.16b
-; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: and v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: orn v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%n0 = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %y ; %x
@@ -189,10 +186,9 @@ define <4 x i32> @out_constant_42_vary(<
define <4 x i32> @in_constant_42_vary(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; CHECK-LABEL: in_constant_42_vary:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.4s, #42
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: movi v2.4s, #42
+; CHECK-NEXT: bsl v0.16b, v2.16b, v1.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
%n1 = and <4 x i32> %n0, %mask
@@ -219,10 +215,9 @@ define <4 x i32> @out_constant_42_vary_i
define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
; CHECK-LABEL: in_constant_42_vary_invmask:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v0.4s, #42
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: bic v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: movi v2.4s, #42
+; CHECK-NEXT: bsl v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
%n0 = xor <4 x i32> <i32 42, i32 42, i32 42, i32 42>, %y ; %x
Modified: llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll (original)
+++ llvm/trunk/test/CodeGen/AArch64/unfold-masked-merge-vector-variablemask.ll Mon May 21 14:41:02 2018
@@ -270,9 +270,8 @@ define <2 x i64> @out_v2i64(<2 x i64> %x
define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
; CHECK-LABEL: in_v1i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <1 x i8> %x, %y
%n1 = and <1 x i8> %n0, %mask
@@ -287,9 +286,8 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1
define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
; CHECK-LABEL: in_v2i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <2 x i8> %x, %y
%n1 = and <2 x i8> %n0, %mask
@@ -300,9 +298,8 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2
define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
; CHECK-LABEL: in_v1i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <1 x i16> %x, %y
%n1 = and <1 x i16> %n0, %mask
@@ -317,9 +314,8 @@ define <1 x i16> @in_v1i16(<1 x i16> %x,
define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
; CHECK-LABEL: in_v4i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i8> %x, %y
%n1 = and <4 x i8> %n0, %mask
@@ -330,9 +326,8 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4
define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
; CHECK-LABEL: in_v2i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <2 x i16> %x, %y
%n1 = and <2 x i16> %n0, %mask
@@ -343,9 +338,8 @@ define <2 x i16> @in_v2i16(<2 x i16> %x,
define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
; CHECK-LABEL: in_v1i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <1 x i32> %x, %y
%n1 = and <1 x i32> %n0, %mask
@@ -360,9 +354,8 @@ define <1 x i32> @in_v1i32(<1 x i32> %x,
define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind {
; CHECK-LABEL: in_v8i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <8 x i8> %x, %y
%n1 = and <8 x i8> %n0, %mask
@@ -373,9 +366,8 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8
define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind {
; CHECK-LABEL: in_v4i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i16> %x, %y
%n1 = and <4 x i16> %n0, %mask
@@ -386,9 +378,8 @@ define <4 x i16> @in_v4i16(<4 x i16> %x,
define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
; CHECK-LABEL: in_v2i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <2 x i32> %x, %y
%n1 = and <2 x i32> %n0, %mask
@@ -399,9 +390,8 @@ define <2 x i32> @in_v2i32(<2 x i32> %x,
define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
; CHECK-LABEL: in_v1i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
-; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: bsl v2.8b, v0.8b, v1.8b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <1 x i64> %x, %y
%n1 = and <1 x i64> %n0, %mask
@@ -416,9 +406,8 @@ define <1 x i64> @in_v1i64(<1 x i64> %x,
define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind {
; CHECK-LABEL: in_v16i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <16 x i8> %x, %y
%n1 = and <16 x i8> %n0, %mask
@@ -429,9 +418,8 @@ define <16 x i8> @in_v16i8(<16 x i8> %x,
define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind {
; CHECK-LABEL: in_v8i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <8 x i16> %x, %y
%n1 = and <8 x i16> %n0, %mask
@@ -442,9 +430,8 @@ define <8 x i16> @in_v8i16(<8 x i16> %x,
define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
; CHECK-LABEL: in_v4i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <4 x i32> %x, %y
%n1 = and <4 x i32> %n0, %mask
@@ -455,9 +442,8 @@ define <4 x i32> @in_v4i32(<4 x i32> %x,
define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
; CHECK-LABEL: in_v2i64:
; CHECK: // %bb.0:
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: bsl v2.16b, v0.16b, v1.16b
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
%n0 = xor <2 x i64> %x, %y
%n1 = and <2 x i64> %n0, %mask
Modified: llvm/trunk/test/CodeGen/X86/machine-cp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/machine-cp.ll?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/machine-cp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/machine-cp.ll Mon May 21 14:41:02 2018
@@ -101,54 +101,64 @@ while.end:
define <16 x float> @foo(<16 x float> %x) {
; CHECK-LABEL: foo:
; CHECK: ## %bb.0: ## %bb
-; CHECK-NEXT: xorps %xmm8, %xmm8
-; CHECK-NEXT: cvttps2dq %xmm3, %xmm9
-; CHECK-NEXT: movaps %xmm3, %xmm13
-; CHECK-NEXT: cmpltps %xmm8, %xmm13
-; CHECK-NEXT: movaps {{.*#+}} xmm7 = [1,1,1,1]
-; CHECK-NEXT: movaps %xmm13, %xmm3
-; CHECK-NEXT: andps %xmm7, %xmm3
-; CHECK-NEXT: cvttps2dq %xmm2, %xmm10
-; CHECK-NEXT: movaps %xmm2, %xmm5
-; CHECK-NEXT: cmpltps %xmm8, %xmm5
-; CHECK-NEXT: movaps %xmm5, %xmm2
-; CHECK-NEXT: andps %xmm7, %xmm2
-; CHECK-NEXT: cvttps2dq %xmm1, %xmm11
+; CHECK-NEXT: movaps %xmm3, %xmm9
+; CHECK-NEXT: movaps %xmm2, %xmm8
+; CHECK-NEXT: movaps %xmm1, %xmm6
+; CHECK-NEXT: movaps %xmm0, %xmm7
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: movaps %xmm3, %xmm1
+; CHECK-NEXT: cmpltps %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm1, %xmm4
-; CHECK-NEXT: cmpltps %xmm8, %xmm4
-; CHECK-NEXT: movaps %xmm4, %xmm1
-; CHECK-NEXT: andps %xmm7, %xmm1
-; CHECK-NEXT: cvttps2dq %xmm0, %xmm12
-; CHECK-NEXT: movaps %xmm0, %xmm6
-; CHECK-NEXT: cmpltps %xmm8, %xmm6
-; CHECK-NEXT: andps %xmm6, %xmm7
-; CHECK-NEXT: orps {{.*}}(%rip), %xmm6
-; CHECK-NEXT: movaps {{.*#+}} xmm14 = [5,6,7,8]
-; CHECK-NEXT: orps %xmm14, %xmm4
-; CHECK-NEXT: movaps {{.*#+}} xmm15 = [9,10,11,12]
-; CHECK-NEXT: orps %xmm15, %xmm5
-; CHECK-NEXT: movaps {{.*#+}} xmm8 = [13,14,15,16]
-; CHECK-NEXT: orps %xmm8, %xmm13
-; CHECK-NEXT: cvtdq2ps %xmm12, %xmm0
-; CHECK-NEXT: cvtdq2ps %xmm11, %xmm11
-; CHECK-NEXT: cvtdq2ps %xmm10, %xmm10
-; CHECK-NEXT: cvtdq2ps %xmm9, %xmm9
-; CHECK-NEXT: andps %xmm8, %xmm9
-; CHECK-NEXT: andps %xmm15, %xmm10
-; CHECK-NEXT: andps %xmm14, %xmm11
-; CHECK-NEXT: andps {{.*}}(%rip), %xmm0
-; CHECK-NEXT: xorps %xmm7, %xmm0
+; CHECK-NEXT: orps {{.*}}(%rip), %xmm4
+; CHECK-NEXT: movaps %xmm4, %xmm10
+; CHECK-NEXT: andnps %xmm1, %xmm10
+; CHECK-NEXT: movaps %xmm2, %xmm1
+; CHECK-NEXT: cmpltps %xmm0, %xmm1
+; CHECK-NEXT: movaps {{.*#+}} xmm11 = [9,10,11,12]
+; CHECK-NEXT: movaps %xmm1, %xmm3
+; CHECK-NEXT: orps %xmm11, %xmm3
+; CHECK-NEXT: movaps %xmm3, %xmm14
+; CHECK-NEXT: andnps %xmm1, %xmm14
+; CHECK-NEXT: cvttps2dq %xmm6, %xmm12
+; CHECK-NEXT: cmpltps %xmm0, %xmm6
+; CHECK-NEXT: movaps {{.*#+}} xmm13 = [5,6,7,8]
+; CHECK-NEXT: movaps %xmm6, %xmm2
+; CHECK-NEXT: orps %xmm13, %xmm2
+; CHECK-NEXT: movaps %xmm2, %xmm5
+; CHECK-NEXT: andnps %xmm6, %xmm5
+; CHECK-NEXT: cvttps2dq %xmm7, %xmm6
+; CHECK-NEXT: cmpltps %xmm0, %xmm7
+; CHECK-NEXT: movaps {{.*#+}} xmm15 = [1,2,3,4]
+; CHECK-NEXT: movaps %xmm7, %xmm0
+; CHECK-NEXT: orps %xmm15, %xmm0
+; CHECK-NEXT: movaps %xmm0, %xmm1
+; CHECK-NEXT: andnps %xmm7, %xmm1
+; CHECK-NEXT: andps %xmm15, %xmm0
+; CHECK-NEXT: cvtdq2ps %xmm6, %xmm6
; CHECK-NEXT: andps %xmm6, %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm11
-; CHECK-NEXT: andps %xmm4, %xmm11
-; CHECK-NEXT: xorps %xmm2, %xmm10
-; CHECK-NEXT: andps %xmm5, %xmm10
-; CHECK-NEXT: xorps %xmm3, %xmm9
-; CHECK-NEXT: andps %xmm13, %xmm9
-; CHECK-NEXT: xorps %xmm7, %xmm0
-; CHECK-NEXT: xorps %xmm11, %xmm1
-; CHECK-NEXT: xorps %xmm10, %xmm2
-; CHECK-NEXT: xorps %xmm9, %xmm3
+; CHECK-NEXT: movaps {{.*#+}} xmm6 = [1,1,1,1]
+; CHECK-NEXT: andps %xmm6, %xmm1
+; CHECK-NEXT: orps %xmm1, %xmm0
+; CHECK-NEXT: andps %xmm13, %xmm2
+; CHECK-NEXT: cvtdq2ps %xmm12, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm2
+; CHECK-NEXT: andps %xmm6, %xmm5
+; CHECK-NEXT: orps %xmm5, %xmm2
+; CHECK-NEXT: andps %xmm11, %xmm3
+; CHECK-NEXT: cvttps2dq %xmm8, %xmm1
+; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm3
+; CHECK-NEXT: andps %xmm6, %xmm14
+; CHECK-NEXT: orps %xmm14, %xmm3
+; CHECK-NEXT: andps %xmm6, %xmm10
+; CHECK-NEXT: andps {{.*}}(%rip), %xmm4
+; CHECK-NEXT: cvttps2dq %xmm9, %xmm1
+; CHECK-NEXT: cvtdq2ps %xmm1, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm4
+; CHECK-NEXT: orps %xmm10, %xmm4
+; CHECK-NEXT: movaps %xmm2, %xmm1
+; CHECK-NEXT: movaps %xmm3, %xmm2
+; CHECK-NEXT: movaps %xmm4, %xmm3
; CHECK-NEXT: retq
bb:
%v3 = icmp slt <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, zeroinitializer
Modified: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll (original)
+++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask-const.ll Mon May 21 14:41:02 2018
@@ -58,18 +58,20 @@ define <4 x i32> @in_constant_varx_mone(
;
; CHECK-SSE2-LABEL: in_constant_varx_mone:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0
+; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0
; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT: pandn (%rdx), %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pand (%rdi), %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_varx_mone:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vpandn (%rdx), %xmm0, %xmm0
-; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT: vpand (%rdi), %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -132,21 +134,22 @@ define <4 x i32> @in_constant_varx_mone_
;
; CHECK-SSE2-LABEL: in_constant_varx_mone_invmask:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa (%rdi), %xmm0
-; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm2
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm2
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm1
+; CHECK-SSE2-NEXT: pcmpeqd %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm0
; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT: pandn (%rdi), %xmm1
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_varx_mone_invmask:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm2
-; CHECK-XOP-NEXT: vpandn %xmm2, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpandn (%rdi), %xmm0, %xmm2
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm1, %xmm1
; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpor %xmm0, %xmm2, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -198,30 +201,29 @@ define <4 x i32> @out_constant_varx_42(<
define <4 x i32> @in_constant_varx_42(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_varx_42:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm0 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andps (%rcx), %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_varx_42:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps (%rdi), %xmm1
+; CHECK-SSE2-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_varx_42:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42]
-; CHECK-XOP-NEXT: vxorps (%rdi), %xmm0, %xmm1
-; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT: vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -275,11 +277,10 @@ define <4 x i32> @in_constant_varx_42_in
; CHECK-SSE1-LABEL: in_constant_varx_42_invmask:
; CHECK-SSE1: # %bb.0:
; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
-; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
-; CHECK-SSE1-NEXT: movaps (%rsi), %xmm2
-; CHECK-SSE1-NEXT: xorps %xmm1, %xmm2
-; CHECK-SSE1-NEXT: andnps %xmm2, %xmm0
-; CHECK-SSE1-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rsi), %xmm1
+; CHECK-SSE1-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
@@ -287,20 +288,17 @@ define <4 x i32> @in_constant_varx_42_in
; CHECK-SSE2-LABEL: in_constant_varx_42_invmask:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm2
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm2
-; CHECK-SSE2-NEXT: andnps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps (%rdi), %xmm1
+; CHECK-SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_varx_42_invmask:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0
-; CHECK-XOP-NEXT: vmovaps {{.*#+}} xmm1 = [42,42,42,42]
-; CHECK-XOP-NEXT: vxorps (%rdi), %xmm1, %xmm2
-; CHECK-XOP-NEXT: vandnps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vpcmov %xmm0, (%rdi), %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -350,27 +348,27 @@ define <4 x i32> @out_constant_mone_vary
define <4 x i32> @in_constant_mone_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_mone_vary:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andnps (%rcx), %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT: orps %xmm0, %xmm1
; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm1
; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
-; CHECK-SSE2-NEXT: andnps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT: vandnps (%rdx), %xmm0, %xmm1
-; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm0
+; CHECK-XOP-NEXT: vandnps (%rsi), %xmm0, %xmm1
+; CHECK-XOP-NEXT: vorps %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -425,33 +423,31 @@ define <4 x i32> @out_constant_mone_vary
define <4 x i32> @in_constant_mone_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1
-; CHECK-SSE1-NEXT: xorps {{.*}}(%rip), %xmm1
-; CHECK-SSE1-NEXT: movaps %xmm0, %xmm2
-; CHECK-SSE1-NEXT: andnps %xmm1, %xmm2
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2
-; CHECK-SSE1-NEXT: movaps %xmm2, (%rdi)
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [nan,nan,nan,nan]
+; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andps (%rdx), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_mone_vary_invmask:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movdqa (%rsi), %xmm1
-; CHECK-SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; CHECK-SSE2-NEXT: pxor (%rdx), %xmm2
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE2-NEXT: pandn %xmm2, %xmm0
-; CHECK-SSE2-NEXT: pxor %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movdqa (%rdx), %xmm0
+; CHECK-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm1
+; CHECK-SSE2-NEXT: pand (%rsi), %xmm0
+; CHECK-SSE2-NEXT: por %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_mone_vary_invmask:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
; CHECK-XOP-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vpxor (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT: vpandn %xmm1, %xmm0, %xmm1
-; CHECK-XOP-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vpxor %xmm1, %xmm0, %xmm1
+; CHECK-XOP-NEXT: vpand (%rsi), %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpor %xmm0, %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -503,30 +499,29 @@ define <4 x i32> @out_constant_42_vary(<
define <4 x i32> @in_constant_42_vary(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_42_vary:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm1 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andps (%rcx), %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_42_vary:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
-; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42]
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: andps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_42_vary:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm1
-; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm0
+; CHECK-XOP-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42,42,42]
+; CHECK-XOP-NEXT: vpcmov %xmm0, (%rsi), %xmm1, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -579,33 +574,29 @@ define <4 x i32> @out_constant_42_vary_i
define <4 x i32> @in_constant_42_vary_invmask(<4 x i32> *%px, <4 x i32> *%py, <4 x i32> *%pmask) {
; CHECK-SSE1-LABEL: in_constant_42_vary_invmask:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT: movaps (%rcx), %xmm1
-; CHECK-SSE1-NEXT: movaps {{.*#+}} xmm2 = [5.885454e-44,5.885454e-44,5.885454e-44,5.885454e-44]
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm2
-; CHECK-SSE1-NEXT: andnps %xmm2, %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps (%rdx), %xmm1
+; CHECK-SSE1-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_constant_42_vary_invmask:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: movaps {{.*#+}} xmm2 = [42,42,42,42]
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm2
-; CHECK-SSE2-NEXT: andnps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: andps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps {{.*}}(%rip), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_constant_42_vary_invmask:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT: vmovaps (%rdx), %xmm1
-; CHECK-XOP-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm2
-; CHECK-XOP-NEXT: vandnps %xmm2, %xmm1, %xmm1
-; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rsi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT: vpcmov %xmm1, {{.*}}(%rip), %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
Modified: llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll?rev=332904&r1=332903&r2=332904&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll (original)
+++ llvm/trunk/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll Mon May 21 14:41:02 2018
@@ -2607,16 +2607,14 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2
;
; CHECK-SSE2-LABEL: in_v2i8:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v2i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <2 x i8> %x, %y
%n1 = and <2 x i8> %n0, %mask
@@ -2693,16 +2691,14 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4
;
; CHECK-SSE2-LABEL: in_v4i8:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v4i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <4 x i8> %x, %y
%n1 = and <4 x i8> %n0, %mask
@@ -2737,16 +2733,14 @@ define <2 x i16> @in_v2i16(<2 x i16> %x,
;
; CHECK-SSE2-LABEL: in_v2i16:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v2i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <2 x i16> %x, %y
%n1 = and <2 x i16> %n0, %mask
@@ -2895,16 +2889,14 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8
;
; CHECK-SSE2-LABEL: in_v8i8:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v8i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <8 x i8> %x, %y
%n1 = and <8 x i8> %n0, %mask
@@ -2963,16 +2955,14 @@ define <4 x i16> @in_v4i16(<4 x i16> %x,
;
; CHECK-SSE2-LABEL: in_v4i16:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v4i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <4 x i16> %x, %y
%n1 = and <4 x i16> %n0, %mask
@@ -3007,16 +2997,14 @@ define <2 x i32> @in_v2i32(<2 x i32> %x,
;
; CHECK-SSE2-LABEL: in_v2i32:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v2i32:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <2 x i32> %x, %y
%n1 = and <2 x i32> %n0, %mask
@@ -3273,16 +3261,14 @@ define <16 x i8> @in_v16i8(<16 x i8> %x,
;
; CHECK-SSE2-LABEL: in_v16i8:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v16i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <16 x i8> %x, %y
%n1 = and <16 x i8> %n0, %mask
@@ -3401,16 +3387,14 @@ define <8 x i16> @in_v8i16(<8 x i16> %x,
;
; CHECK-SSE2-LABEL: in_v8i16:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v8i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <8 x i16> %x, %y
%n1 = and <8 x i16> %n0, %mask
@@ -3452,30 +3436,29 @@ define <4 x i32> @in_v4i32(<4 x i32> *%p
;
; CHECK-SSE1-LABEL: in_v4i32:
; CHECK-SSE1: # %bb.0:
-; CHECK-SSE1-NEXT: movaps (%rdx), %xmm0
-; CHECK-SSE1-NEXT: movaps (%rsi), %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: andps (%rcx), %xmm1
-; CHECK-SSE1-NEXT: xorps %xmm0, %xmm1
-; CHECK-SSE1-NEXT: movaps %xmm1, (%rdi)
+; CHECK-SSE1-NEXT: movaps (%rcx), %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE1-NEXT: andnps (%rdx), %xmm1
+; CHECK-SSE1-NEXT: andps (%rsi), %xmm0
+; CHECK-SSE1-NEXT: orps %xmm1, %xmm0
+; CHECK-SSE1-NEXT: movaps %xmm0, (%rdi)
; CHECK-SSE1-NEXT: movq %rdi, %rax
; CHECK-SSE1-NEXT: retq
;
; CHECK-SSE2-LABEL: in_v4i32:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm1
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm1
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm1
+; CHECK-SSE2-NEXT: andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v4i32:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %xmm0
-; CHECK-XOP-NEXT: vxorps (%rdi), %xmm0, %xmm1
-; CHECK-XOP-NEXT: vandps (%rdx), %xmm1, %xmm1
-; CHECK-XOP-NEXT: vxorps %xmm0, %xmm1, %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %xmm1
+; CHECK-XOP-NEXT: vpcmov %xmm1, (%rsi), %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i32>, <4 x i32> *%px, align 16
%y = load <4 x i32>, <4 x i32> *%py, align 16
@@ -3513,16 +3496,14 @@ define <2 x i64> @in_v2i64(<2 x i64> %x,
;
; CHECK-SSE2-LABEL: in_v2i64:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
; CHECK-SSE2-NEXT: andps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm1, %xmm0
+; CHECK-SSE2-NEXT: andnps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v2i64:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vandps %xmm2, %xmm0, %xmm0
-; CHECK-XOP-NEXT: vxorps %xmm1, %xmm0, %xmm0
+; CHECK-XOP-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0
; CHECK-XOP-NEXT: retq
%n0 = xor <2 x i64> %x, %y
%n1 = and <2 x i64> %n0, %mask
@@ -4067,24 +4048,23 @@ define <32 x i8> @in_v32i8(<32 x i8> *%p
;
; CHECK-SSE2-LABEL: in_v32i8:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm1
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v32i8:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT: retq
%x = load <32 x i8>, <32 x i8> *%px, align 32
%y = load <32 x i8>, <32 x i8> *%py, align 32
@@ -4402,24 +4382,23 @@ define <16 x i16> @in_v16i16(<16 x i16>
;
; CHECK-SSE2-LABEL: in_v16i16:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm1
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v16i16:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT: retq
%x = load <16 x i16>, <16 x i16> *%px, align 32
%y = load <16 x i16>, <16 x i16> *%py, align 32
@@ -4571,24 +4550,23 @@ define <8 x i32> @in_v8i32(<8 x i32> *%p
;
; CHECK-SSE2-LABEL: in_v8i32:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm1
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v8i32:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT: retq
%x = load <8 x i32>, <8 x i32> *%px, align 32
%y = load <8 x i32>, <8 x i32> *%py, align 32
@@ -4664,24 +4642,23 @@ define <4 x i64> @in_v4i64(<4 x i64> *%p
;
; CHECK-SSE2-LABEL: in_v4i64:
; CHECK-SSE2: # %bb.0:
-; CHECK-SSE2-NEXT: movaps (%rsi), %xmm2
-; CHECK-SSE2-NEXT: movaps 16(%rsi), %xmm3
-; CHECK-SSE2-NEXT: movaps (%rdi), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: movaps 16(%rdi), %xmm1
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
-; CHECK-SSE2-NEXT: andps 16(%rdx), %xmm1
-; CHECK-SSE2-NEXT: andps (%rdx), %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm2, %xmm0
-; CHECK-SSE2-NEXT: xorps %xmm3, %xmm1
+; CHECK-SSE2-NEXT: movaps (%rdx), %xmm0
+; CHECK-SSE2-NEXT: movaps 16(%rdx), %xmm1
+; CHECK-SSE2-NEXT: movaps %xmm0, %xmm2
+; CHECK-SSE2-NEXT: andnps (%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps (%rdi), %xmm0
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm0
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm2
+; CHECK-SSE2-NEXT: andnps 16(%rsi), %xmm2
+; CHECK-SSE2-NEXT: andps 16(%rdi), %xmm1
+; CHECK-SSE2-NEXT: orps %xmm2, %xmm1
; CHECK-SSE2-NEXT: retq
;
; CHECK-XOP-LABEL: in_v4i64:
; CHECK-XOP: # %bb.0:
-; CHECK-XOP-NEXT: vmovaps (%rsi), %ymm0
-; CHECK-XOP-NEXT: vxorps (%rdi), %ymm0, %ymm1
-; CHECK-XOP-NEXT: vandps (%rdx), %ymm1, %ymm1
-; CHECK-XOP-NEXT: vxorps %ymm0, %ymm1, %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdi), %ymm0
+; CHECK-XOP-NEXT: vmovdqa (%rdx), %ymm1
+; CHECK-XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0
; CHECK-XOP-NEXT: retq
%x = load <4 x i64>, <4 x i64> *%px, align 32
%y = load <4 x i64>, <4 x i64> *%py, align 32