[llvm] e0cff30 - [X86][SSE] LowerVectorAllZeroTest - add support for pre-SSE41 targets
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 14 05:46:07 PDT 2020
Author: Simon Pilgrim
Date: 2020-06-14T13:41:56+01:00
New Revision: e0cff30c17d585a5618ce00266abf4f2fdb5d415
URL: https://github.com/llvm/llvm-project/commit/e0cff30c17d585a5618ce00266abf4f2fdb5d415
DIFF: https://github.com/llvm/llvm-project/commit/e0cff30c17d585a5618ce00266abf4f2fdb5d415.diff
LOG: [X86][SSE] LowerVectorAllZeroTest - add support for pre-SSE41 targets
Even without PTEST, we can still efficiently perform an OR reduction as PMOVMSKB(PCMPEQB(X,0)) == 0xFFFF, avoiding xmm->gpr extractions.
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/pr45378.ll
llvm/test/CodeGen/X86/ptest.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a0222d1c211..ddea6fa4df95 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -21346,13 +21346,14 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
return true;
}
-// Check whether an OR'd tree is PTEST-able.
+// Check whether an OR'd tree is PTEST-able, or if we can fallback to
+// CMP(MOVMSK(PCMPEQB(X,0))).
static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
- if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ if (!Subtarget.hasSSE2() || !Op->hasOneUse())
return SDValue();
SmallVector<SDValue, 8> VecIns;
@@ -21365,9 +21366,11 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
return SDValue();
SDLoc DL(Op);
- MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ bool UsePTEST = Subtarget.hasSSE41();
+ MVT TestVT =
+ VT.is128BitVector() ? (UsePTEST ? MVT::v2i64 : MVT::v16i8) : MVT::v4i64;
- // Cast all vectors into TestVT for PTEST.
+ // Cast all vectors into TestVT for PTEST/PCMPEQ.
for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
@@ -21382,7 +21385,16 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
+
+ if (UsePTEST)
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(),
+ VecIns.back());
+
+ SDValue Result = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, VecIns.back(),
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(0xFFFF, DL, MVT::i32));
}
/// return true if \c Op has a use that doesn't just read flags.
@@ -22530,12 +22542,12 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
return BT;
}
- // Try to use PTEST for a tree ORs equality compared with 0.
+ // Try to use PTEST/PMOVMSKB for a tree ORs equality compared with 0.
// TODO: We could do AND tree with all 1s as well by using the C flag.
if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
- return PTEST;
+ if (SDValue CmpZ = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
+ return CmpZ;
}
// Try to lower using KORTEST or KTEST.
diff --git a/llvm/test/CodeGen/X86/pr45378.ll b/llvm/test/CodeGen/X86/pr45378.ll
index b33d4c0ab22a..c092378a2a01 100644
--- a/llvm/test/CodeGen/X86/pr45378.ll
+++ b/llvm/test/CodeGen/X86/pr45378.ll
@@ -57,10 +57,10 @@ define i1 @parseHeaders2_scalar_or(i64 * %ptr) nounwind {
; SSE2-LABEL: parseHeaders2_scalar_or:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/ptest.ll b/llvm/test/CodeGen/X86/ptest.ll
index 605dfc6e2165..13ca7195bca3 100644
--- a/llvm/test/CodeGen/X86/ptest.ll
+++ b/llvm/test/CodeGen/X86/ptest.ll
@@ -7,10 +7,10 @@
define i32 @veccond128(<4 x i32> %input) {
; SSE2-LABEL: veccond128:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: orq %rax, %rcx
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: je .LBB0_2
; SSE2-NEXT: # %bb.1: # %if-true-block
; SSE2-NEXT: xorl %eax, %eax
@@ -53,15 +53,11 @@ endif-block:
define i32 @veccond256(<8 x i32> %input) {
; SSE2-LABEL: veccond256:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: orq %rax, %rcx
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: orq %rax, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: je .LBB1_2
; SSE2-NEXT: # %bb.1: # %if-true-block
; SSE2-NEXT: xorl %eax, %eax
@@ -107,25 +103,13 @@ endif-block:
define i32 @veccond512(<16 x i32> %input) {
; SSE2-LABEL: veccond512:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: orq %rax, %rsi
-; SSE2-NEXT: orq %rcx, %rsi
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: movq %xmm1, %rdi
-; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: orq %rcx, %rdi
-; SSE2-NEXT: orq %rsi, %rdi
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %eax
+; SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; SSE2-NEXT: je .LBB2_2
; SSE2-NEXT: # %bb.1: # %if-true-block
; SSE2-NEXT: xorl %eax, %eax
@@ -204,11 +188,11 @@ endif-block:
define i32 @vectest128(<4 x i32> %input) {
; SSE2-LABEL: vectest128:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rdx
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -234,16 +218,12 @@ define i32 @vectest128(<4 x i32> %input) {
define i32 @vectest256(<8 x i32> %input) {
; SSE2-LABEL: vectest256:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: orq %rax, %rcx
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: orq %rax, %rdx
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -271,26 +251,14 @@ define i32 @vectest256(<8 x i32> %input) {
define i32 @vectest512(<16 x i32> %input) {
; SSE2-LABEL: vectest512:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rsi
-; SSE2-NEXT: orq %rdx, %rsi
-; SSE2-NEXT: orq %rax, %rsi
-; SSE2-NEXT: orq %rcx, %rsi
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: movq %xmm3, %rdx
-; SSE2-NEXT: movq %xmm1, %rdi
-; SSE2-NEXT: orq %rdx, %rdi
-; SSE2-NEXT: orq %rax, %rdi
-; SSE2-NEXT: orq %rcx, %rdi
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: orq %rsi, %rdi
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: setne %al
; SSE2-NEXT: retq
;
@@ -347,10 +315,10 @@ define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
; SSE2-LABEL: vecsel128:
; SSE2: # %bb.0:
; SSE2-NEXT: movl %edi, %eax
-; SSE2-NEXT: movq %xmm0, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %ecx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: cmovel %esi, %eax
; SSE2-NEXT: retq
;
@@ -377,15 +345,11 @@ define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
; SSE2-LABEL: vecsel256:
; SSE2: # %bb.0:
; SSE2-NEXT: movl %edi, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rcx
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
-; SSE2-NEXT: movq %xmm1, %rcx
-; SSE2-NEXT: movq %xmm0, %rdi
-; SSE2-NEXT: orq %rcx, %rdi
-; SSE2-NEXT: orq %rdx, %rdi
+; SSE2-NEXT: por %xmm1, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %ecx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: cmovel %esi, %eax
; SSE2-NEXT: retq
;
@@ -414,25 +378,13 @@ define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
; SSE2-LABEL: vecsel512:
; SSE2: # %bb.0:
; SSE2-NEXT: movl %edi, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %r8
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rdx
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rdi
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm4, %rcx
-; SSE2-NEXT: orq %rdi, %rcx
-; SSE2-NEXT: orq %r8, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
-; SSE2-NEXT: movq %xmm2, %r9
-; SSE2-NEXT: movq %xmm0, %r8
-; SSE2-NEXT: movq %xmm3, %rdi
-; SSE2-NEXT: movq %xmm1, %rdx
-; SSE2-NEXT: orq %rdi, %rdx
-; SSE2-NEXT: orq %r9, %rdx
-; SSE2-NEXT: orq %r8, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: por %xmm3, %xmm1
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: por %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT: pmovmskb %xmm0, %ecx
+; SSE2-NEXT: cmpl $65535, %ecx # imm = 0xFFFF
; SSE2-NEXT: cmovel %esi, %eax
; SSE2-NEXT: retq
;
More information about the llvm-commits
mailing list