[llvm] 0ee1db2 - [X86] Try to avoid casts around logical vector ops recursively.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 19 17:41:15 PST 2020
Author: Florian Hahn
Date: 2020-01-19T17:22:43-08:00
New Revision: 0ee1db2d1d7aab9e2736c664ebd27fa23919964e
URL: https://github.com/llvm/llvm-project/commit/0ee1db2d1d7aab9e2736c664ebd27fa23919964e
DIFF: https://github.com/llvm/llvm-project/commit/0ee1db2d1d7aab9e2736c664ebd27fa23919964e.diff
LOG: [X86] Try to avoid casts around logical vector ops recursively.
Currently PromoteMaskArithmetic only looks at a single operation to
skip casts. This means we miss cases where multiple masks are combined.
This patch updates PromoteMaskArithmetic to try to recursively promote
AND/XOR/OR nodes that terminate in truncates of the right size or
constant vectors.
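For illustration, here is a minimal, self-contained C++ sketch of the idea.
It is not the LLVM implementation: the Node/Op types, the promote() helper
and the printed strings are made up for this example, and unlike the real
code (which walks SelectionDAG nodes and only accepts constant vectors as
the right-hand operand) it treats both operands the same. It only mirrors
the shape of the recursion: walk a depth-bounded tree of AND/OR/XOR nodes,
accept leaves that are truncates from the wide type or constant vectors,
and rebuild the whole expression at the wide type.

// promote_sketch.cpp - illustrative only; the Node/Op types and promote()
// helper are assumptions for this example, not LLVM APIs.
#include <iostream>
#include <memory>
#include <optional>
#include <string>

enum class Op { And, Or, Xor, Trunc, Const };

struct Node {
  Op Opcode;
  std::shared_ptr<Node> LHS, RHS; // null for leaves (Trunc, Const)
  std::string Name;               // wide input (Trunc) or constant (Const)
};
using NodePtr = std::shared_ptr<Node>;

static NodePtr leaf(Op O, std::string Name) {
  return std::make_shared<Node>(Node{O, nullptr, nullptr, std::move(Name)});
}
static NodePtr bin(Op O, NodePtr L, NodePtr R) {
  return std::make_shared<Node>(Node{O, std::move(L), std::move(R), ""});
}

// Bound the recursion, as the patch does with SelectionDAG::MaxRecursionDepth.
static constexpr unsigned MaxDepth = 6;

// Rebuild the expression at the wide type, or fail. A truncate leaf yields its
// wide input directly, a constant leaf is widened with a zero-extend, and an
// interior AND/OR/XOR node is rebuilt from its promoted operands.
static std::optional<std::string> promote(const NodePtr &N, unsigned Depth) {
  if (Depth >= MaxDepth)
    return std::nullopt;
  if (N->Opcode == Op::Trunc)
    return N->Name;                 // drop the truncate
  if (N->Opcode == Op::Const)
    return "zext(" + N->Name + ")"; // widen the constant vector
  auto L = promote(N->LHS, Depth + 1);
  auto R = promote(N->RHS, Depth + 1);
  if (!L || !R)
    return std::nullopt;            // some leaf could not be widened
  const char *Mn = N->Opcode == Op::And ? "and"
                   : N->Opcode == Op::Or ? "or" : "xor";
  return std::string(Mn) + "(" + *L + ", " + *R + ")";
}

int main() {
  // or (and (trunc x), (trunc y)), (xor (trunc z), C)
  NodePtr T =
      bin(Op::Or, bin(Op::And, leaf(Op::Trunc, "x"), leaf(Op::Trunc, "y")),
          bin(Op::Xor, leaf(Op::Trunc, "z"), leaf(Op::Const, "C")));
  if (auto Wide = promote(T, 0))
    std::cout << *Wide << "\n";     // prints: or(and(x, y), xor(z, zext(C)))
  return 0;
}

Running the sketch rebuilds the whole mask expression at the wide type in one
pass, which is the effect the DAG combine below now achieves for chains of
masks rather than a single operation.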
Reviewers: craig.topper, RKSimon, spatel
Reviewed By: RKSimon
Differential Revision: https://reviews.llvm.org/D72524
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/v8i1-masks.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a1dd9f9d302..e24e22104ce4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39898,6 +39898,65 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+//   or (and (truncate x), (truncate y)),
+//      (xor (truncate z), build_vector (constants))
+// Given a target type \p VT, we generate
+//   or (and x, y), (xor z, zext(build_vector (constants)))
+// where x, y and z have type \p VT. We can do so if each operand is either a
+// truncate from VT, a constant build_vector (second operand only), or can
+// itself be recursively promoted.
+static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
+                                     unsigned Depth) {
+  // Limit recursion to avoid excessive compile times.
+  if (Depth >= SelectionDAG::MaxRecursionDepth)
+    return SDValue();
+
+  if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
+      N->getOpcode() != ISD::OR)
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
+    return SDValue();
+
+  if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
+    N0 = NN0;
+  else {
+    // The Left side has to be a trunc.
+    if (N0.getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+
+    // The type of the truncated inputs.
+    if (N0.getOperand(0).getValueType() != VT)
+      return SDValue();
+
+    N0 = N0.getOperand(0);
+  }
+
+  if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+    N1 = NN1;
+  else {
+    // The right side has to be a 'trunc' or a constant vector.
+    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+                    N1.getOperand(0).getValueType() == VT;
+    if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
+      return SDValue();
+
+    if (RHSTrunc)
+      N1 = N1.getOperand(0);
+    else
+      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+  }
+
+  return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+}
+
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
@@ -39909,6 +39968,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
+ SDLoc DL(N);
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
@@ -39916,46 +39976,11 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
- if (Narrow->getOpcode() != ISD::XOR &&
- Narrow->getOpcode() != ISD::AND &&
- Narrow->getOpcode() != ISD::OR)
- return SDValue();
-
- SDValue N0 = Narrow->getOperand(0);
- SDValue N1 = Narrow->getOperand(1);
- SDLoc DL(Narrow);
-
- // The Left side has to be a trunc.
- if (N0.getOpcode() != ISD::TRUNCATE)
- return SDValue();
-
- // The type of the truncated inputs.
- if (N0.getOperand(0).getValueType() != VT)
- return SDValue();
-
- // The right side has to be a 'trunc' or a constant vector.
- bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
- N1.getOperand(0).getValueType() == VT;
- if (!RHSTrunc &&
- !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
- return SDValue();
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
- return SDValue();
-
- // Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0.getOperand(0);
- if (RHSTrunc)
- N1 = N1.getOperand(0);
- else
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
-
// Generate the wide operation.
- SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
- unsigned Opcode = N->getOpcode();
- switch (Opcode) {
+ SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+ if (!Op)
+ return SDValue();
+ switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index 82ff35772802..a661e77a98e8 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -212,18 +212,10 @@ define <8 x i32> @three_ands(<8 x float> %x) {
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI4_1, %ymm0, %ymm2
-; X32-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: three_ands:
@@ -231,18 +223,10 @@ define <8 x i32> @three_ands(<8 x float> %x) {
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: three_ands:
@@ -251,15 +235,10 @@ define <8 x i32> @three_ands(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: three_ands:
@@ -268,15 +247,10 @@ define <8 x i32> @three_ands(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -294,22 +268,12 @@ define <8 x i32> @four_ands(<8 x float> %x) {
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI5_1, %ymm0, %ymm2
-; X32-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-NEXT: vcmpneqps LCPI5_2, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X32-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: four_ands:
@@ -317,22 +281,12 @@ define <8 x i32> @four_ands(<8 x float> %x) {
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: four_ands:
@@ -341,20 +295,13 @@ define <8 x i32> @four_ands(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X32-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: four_ands:
@@ -363,20 +310,13 @@ define <8 x i32> @four_ands(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -396,26 +336,14 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI6_1, %ymm0, %ymm2
-; X32-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-NEXT: vcmpneqps LCPI6_2, %ymm0, %ymm3
-; X32-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X32-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X32-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-NEXT: vcmpneqps LCPI6_3, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X32-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: five_ands:
@@ -423,26 +351,14 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm3
-; X64-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X64-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X64-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: five_ands:
@@ -451,25 +367,16 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X32-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X32-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X32-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: five_ands:
@@ -478,25 +385,16 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X64-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpand %xmm0, %xmm2, %xmm0
-; X64-AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -560,18 +458,10 @@ define <8 x i32> @three_or(<8 x float> %x) {
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI8_1, %ymm0, %ymm2
-; X32-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: three_or:
@@ -579,18 +469,10 @@ define <8 x i32> @three_or(<8 x float> %x) {
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: three_or:
@@ -599,15 +481,10 @@ define <8 x i32> @three_or(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: three_or:
@@ -616,15 +493,10 @@ define <8 x i32> @three_or(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -642,22 +514,12 @@ define <8 x i32> @four_or(<8 x float> %x) {
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI9_1, %ymm0, %ymm2
-; X32-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-NEXT: vorps %ymm3, %ymm2, %ymm2
; X32-NEXT: vcmpneqps LCPI9_2, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: four_or:
@@ -665,22 +527,12 @@ define <8 x i32> @four_or(<8 x float> %x) {
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: four_or:
@@ -689,20 +541,13 @@ define <8 x i32> @four_or(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: four_or:
@@ -711,20 +556,13 @@ define <8 x i32> @four_or(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -744,26 +582,14 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI10_1, %ymm0, %ymm2
-; X32-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-NEXT: vorps %ymm3, %ymm2, %ymm2
; X32-NEXT: vcmpneqps LCPI10_2, %ymm0, %ymm3
-; X32-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X32-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X32-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X32-NEXT: vorps %ymm3, %ymm2, %ymm2
; X32-NEXT: vcmpneqps LCPI10_3, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: five_or:
@@ -771,26 +597,14 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm3
-; X64-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X64-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: five_or:
@@ -799,25 +613,16 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X32-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X32-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X32-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: five_or:
@@ -826,25 +631,16 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X64-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2
+; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -923,19 +719,11 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI12_1, %ymm0, %ymm2
; X32-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-NEXT: vcmpneqps LCPI12_2, %ymm0, %ymm0
; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: four_or_and:
@@ -944,19 +732,11 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: four_or_and:
@@ -966,17 +746,12 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: four_or_and:
@@ -986,17 +761,12 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -1019,19 +789,11 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vcmpneqps LCPI13_2, %ymm0, %ymm2
+; X32-NEXT: vcmpneqps LCPI13_2, %ymm0, %ymm3
; X32-NEXT: vcmpneqps LCPI13_3, %ymm0, %ymm0
-; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: five_or_and:
@@ -1042,19 +804,11 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm2
+; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm3
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: five_or_and:
@@ -1066,18 +820,13 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: five_or_and:
@@ -1089,18 +838,13 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -1123,19 +867,11 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X32-NEXT: vcmpltps LCPI14_1, %ymm0, %ymm2
; X32-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-NEXT: vcmpneqps LCPI14_2, %ymm0, %ymm0
; X32-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: four_or_and_xor:
@@ -1144,19 +880,11 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: four_or_and_xor:
@@ -1166,17 +894,12 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X32-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: four_or_and_xor:
@@ -1186,17 +909,12 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2
; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -1215,100 +933,66 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X32: ## %bb.0: ## %entry
; X32-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X32-NEXT: vcmpltps LCPI15_1, %ymm0, %ymm2
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-NEXT: vxorps %ymm3, %ymm2, %ymm2
-; X32-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps LCPI15_2, %ymm0, %ymm3
+; X32-NEXT: vcmpneqps LCPI15_2, %ymm0, %ymm4
; X32-NEXT: vcmpneqps LCPI15_3, %ymm0, %ymm0
-; X32-NEXT: vandps %ymm0, %ymm3, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; X32-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vandps %ymm0, %ymm4, %ymm0
+; X32-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X32-NEXT: vxorps %ymm0, %ymm2, %ymm0
+; X32-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: five_or_and_xor:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm4
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm3, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; X64-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vandps %ymm0, %ymm4, %ymm0
+; X64-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X64-NEXT: vxorps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: five_or_and_xor:
; X32-AVX2: ## %bb.0: ## %entry
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X32-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
-; X32-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; X32-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm5, %ymm0, %ymm0
+; X32-AVX2-NEXT: vandps %ymm0, %ymm4, %ymm0
+; X32-AVX2-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X32-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0
+; X32-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: five_or_and_xor:
; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0
-; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm5 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm5, %ymm0, %ymm0
+; X64-AVX2-NEXT: vandps %ymm0, %ymm4, %ymm0
+; X64-AVX2-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; X64-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0
+; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
@@ -1331,26 +1015,14 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X32-NEXT: vcmpltps LCPI16_1, %ymm0, %ymm2
; X32-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-NEXT: vcmpneqps LCPI16_2, %ymm0, %ymm4
+; X32-NEXT: vandps %ymm4, %ymm3, %ymm3
; X32-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps LCPI16_2, %ymm0, %ymm3
-; X32-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X32-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X32-NEXT: vpand %xmm3, %xmm2, %xmm2
-; X32-NEXT: vcmpneqps LCPI16_3, %ymm0, %ymm3
-; X32-NEXT: vxorps %ymm1, %ymm3, %ymm1
-; X32-NEXT: vextractf128 $1, %ymm1, %xmm3
-; X32-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; X32-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X32-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; X32-NEXT: vcmpneqps LCPI16_3, %ymm0, %ymm2
+; X32-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X32-NEXT: vcmpneqps LCPI16_4, %ymm0, %ymm0
-; X32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-NEXT: vpmovsxwd %xmm0, %xmm1
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-NEXT: vpmovsxwd %xmm0, %xmm0
-; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: six_or_and_xor:
@@ -1360,26 +1032,14 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X64-NEXT: vcmpltps {{.*}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm4
+; X64-NEXT: vandps %ymm4, %ymm3, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm3
-; X64-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X64-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X64-NEXT: vpand %xmm3, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm3
-; X64-NEXT: vxorps %ymm1, %ymm3, %ymm1
-; X64-NEXT: vextractf128 $1, %ymm1, %xmm3
-; X64-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm2
+; X64-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X64-NEXT: vcmpneqps {{.*}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-NEXT: vpmovsxwd %xmm0, %xmm1
-; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X64-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
; X32-AVX2-LABEL: six_or_and_xor:
@@ -1390,26 +1050,17 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X32-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X32-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
+; X32-AVX2-NEXT: vandps %ymm4, %ymm3, %ymm3
; X32-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X32-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X32-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X32-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X32-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X32-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1
-; X32-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
-; X32-AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; X32-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X32-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X32-AVX2-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X32-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
; X32-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X32-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X32-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X32-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X32-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X32-AVX2-NEXT: retl
;
; X64-AVX2-LABEL: six_or_and_xor:
@@ -1420,26 +1071,17 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm4
+; X64-AVX2-NEXT: vandps %ymm4, %ymm3, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vextractf128 $1, %ymm3, %xmm4
-; X64-AVX2-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; X64-AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1
-; X64-AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
-; X64-AVX2-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
+; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X64-AVX2-NEXT: vxorps %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; X64-AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; X64-AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
-; X64-AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
entry:
%cmp = fcmp oge <8 x float> %x, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>