[llvm] r275276 - [x86][SSE/AVX] optimize pcmp results better (PR28484)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 13 09:04:07 PDT 2016
Author: spatel
Date: Wed Jul 13 11:04:07 2016
New Revision: 275276
URL: http://llvm.org/viewvc/llvm-project?rev=275276&view=rev
Log:
[x86][SSE/AVX] optimize pcmp results better (PR28484)
We know that pcmp produces an all-ones/all-zeros bitmask per element, so we can use that behavior
to avoid an unnecessary constant load.
One could argue that load+and is actually the better choice for some CPUs (Intel big cores) because
shifts don't have the same throughput potential as load+and on those cores, but that should be
handled as a later CPU-specific transform if it ever comes up. Removing the load is the more
generally applicable x86 optimization. Note that the inconsistent use of vpbroadcast in the
test cases is tracked as PR28505:
https://llvm.org/bugs/show_bug.cgi?id=28505
Differential Revision: http://reviews.llvm.org/D22225
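For reference, a minimal standalone sketch (not part of the patch; the test values and the
driver are illustrative only) of why the rewrite is sound: every pcmpeq/pcmpgt lane is all-ones
or all-zeros, so and-with-1 and a logical shift right by (element-bits - 1) produce identical
results:

#include <immintrin.h>
#include <cstdio>

int main() {
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i b = _mm_setr_epi16(1, 0, 3, 0, 5, 0, 7, 0);
  __m128i cmp = _mm_cmpeq_epi16(a, b);            // each lane: 0xFFFF or 0x0000

  // Old lowering: load a constant splat of 1 and AND it in (pand + constant pool).
  __m128i and1 = _mm_and_si128(cmp, _mm_set1_epi16(1));
  // New lowering: shift the sign bit down to bit 0 (psrlw $15); no constant load.
  __m128i srl15 = _mm_srli_epi16(cmp, 15);

  // The two results match in every lane, so the mask prints as 0xffff.
  __m128i eq = _mm_cmpeq_epi16(and1, srl15);
  printf("equal mask: 0x%04x\n", _mm_movemask_epi8(eq) & 0xFFFF);
  return 0;
}

The same equivalence holds for the 32- and 64-bit element cases (psrld $31, psrlq $63); byte
elements are excluded because there is no vector shift-by-immediate for bytes (no psrlb).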
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx512-ext.ll
llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/trunk/test/CodeGen/X86/shift-pcmp.ll
llvm/trunk/test/CodeGen/X86/vector-pcmp.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=275276&r1=275275&r2=275276&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jul 13 11:04:07 2016
@@ -28186,6 +28186,42 @@ static SDValue convertIntLogicToFPLogic(
return SDValue();
}
+/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
+/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
+/// eliminate loading the vector constant mask value. This relies on the fact
+/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
+static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+ SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
+ SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+
+ // TODO: Use AssertSext to mark any nodes that have the property of producing
+ // all-ones or all-zeros. Then check for that node rather than particular
+ // opcodes.
+ if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+ return SDValue();
+
+ // The existence of the PCMP node guarantees that we have the required SSE2 or
+ // AVX2 for a shift of this vector type, but there is no vector shift by
+ // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
+ // masked compare nodes, so they should not make it here.
+ EVT VT0 = Op0.getValueType();
+ EVT VT1 = Op1.getValueType();
+ unsigned EltBitWidth = VT0.getScalarType().getSizeInBits();
+ if (VT0 != VT1 || EltBitWidth == 8)
+ return SDValue();
+
+ assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+
+ APInt SplatVal;
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+ SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
+ return DAG.getBitcast(N->getValueType(0), Shift);
+}
+
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -28204,6 +28240,9 @@ static SDValue combineAnd(SDNode *N, Sel
if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
return R;
+ if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+ return ShiftRight;
+
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
Modified: llvm/trunk/test/CodeGen/X86/avx512-ext.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-ext.ll?rev=275276&r1=275275&r2=275276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-ext.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-ext.ll Wed Jul 13 11:04:07 2016
@@ -1919,10 +1919,9 @@ define <32 x i16> @zext_32xi1_to_32xi16(
; KNL-LABEL: zext_32xi1_to_32xi16:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; KNL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; KNL-NEXT: vpand %ymm2, %ymm1, %ymm1
+; KNL-NEXT: vpsrlw $15, %ymm1, %ymm1
; KNL-NEXT: retq
;
; SKX-LABEL: zext_32xi1_to_32xi16:
@@ -1939,7 +1938,7 @@ define <16 x i16> @zext_16xi1_to_16xi16(
; KNL-LABEL: zext_16xi1_to_16xi16:
; KNL: ## BB#0:
; KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT: vpsrlw $15, %ymm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_16xi1_to_16xi16:
@@ -1983,8 +1982,7 @@ define <4 x i32> @zext_4xi1_to_4x32(<4 x
; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; KNL-NEXT: vpsrld $31, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_4xi1_to_4x32:
@@ -2007,7 +2005,7 @@ define <2 x i64> @zext_2xi1_to_2xi64(<2
; KNL-NEXT: vpand %xmm2, %xmm1, %xmm1
; KNL-NEXT: vpand %xmm2, %xmm0, %xmm0
; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: zext_2xi1_to_2xi64:
Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=275276&r1=275275&r2=275276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Wed Jul 13 11:04:07 2016
@@ -1215,7 +1215,7 @@ define <2 x i64> @test45(<2 x i16> %x, <
; KNL-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
; KNL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
; KNL-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT: vpsrlq $63, %xmm0, %xmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test45:
Modified: llvm/trunk/test/CodeGen/X86/shift-pcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shift-pcmp.ll?rev=275276&r1=275275&r2=275276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shift-pcmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shift-pcmp.ll Wed Jul 13 11:04:07 2016
@@ -26,14 +26,14 @@ define <8 x i16> @bar(<8 x i16> %a, <8 x
; SSE-LABEL: bar:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: psllw $5, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: bar:
; AVX: # BB#0:
; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: vpsllw $5, %xmm0, %xmm0
; AVX-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-pcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-pcmp.ll?rev=275276&r1=275275&r2=275276&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-pcmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-pcmp.ll Wed Jul 13 11:04:07 2016
@@ -294,10 +294,9 @@ define <16 x i16> @cmpeq_zext_v16i16(<16
; SSE-LABEL: cmpeq_zext_v16i16:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: pcmpeqw %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psrlw $15, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v16i16:
@@ -313,7 +312,7 @@ define <16 x i16> @cmpeq_zext_v16i16(<16
; AVX2-LABEL: cmpeq_zext_v16i16:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0
; AVX2-NEXT: retq
;
%cmp = icmp eq <16 x i16> %a, %b
@@ -325,21 +324,14 @@ define <4 x i32> @cmpeq_zext_v4i32(<4 x
; SSE-LABEL: cmpeq_zext_v4i32:
; SSE: # BB#0:
; SSE-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: retq
;
-; AVX1-LABEL: cmpeq_zext_v4i32:
-; AVX1: # BB#0:
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: cmpeq_zext_v4i32:
-; AVX2: # BB#0:
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
+; AVX-LABEL: cmpeq_zext_v4i32:
+; AVX: # BB#0:
+; AVX-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrld $31, %xmm0, %xmm0
+; AVX-NEXT: retq
;
%cmp = icmp eq <4 x i32> %a, %b
%zext = zext <4 x i1> %cmp to <4 x i32>
@@ -363,10 +355,9 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x
; SSE42-LABEL: cmpeq_zext_v4i64:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpeqq %xmm2, %xmm0
-; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [1,1]
-; SSE42-NEXT: pand %xmm2, %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
; SSE42-NEXT: pcmpeqq %xmm3, %xmm1
-; SSE42-NEXT: pand %xmm2, %xmm1
+; SSE42-NEXT: psrlq $63, %xmm1
; SSE42-NEXT: retq
;
; AVX1-LABEL: cmpeq_zext_v4i64:
@@ -382,8 +373,7 @@ define <4 x i64> @cmpeq_zext_v4i64(<4 x
; AVX2-LABEL: cmpeq_zext_v4i64:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $63, %ymm0, %ymm0
; AVX2-NEXT: retq
;
%cmp = icmp eq <4 x i64> %a, %b
@@ -426,13 +416,13 @@ define <8 x i16> @cmpgt_zext_v8i16(<8 x
; SSE-LABEL: cmpgt_zext_v8i16:
; SSE: # BB#0:
; SSE-NEXT: pcmpgtw %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE-NEXT: psrlw $15, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: cmpgt_zext_v8i16:
; AVX: # BB#0:
; AVX-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $15, %xmm0, %xmm0
; AVX-NEXT: retq
;
%cmp = icmp sgt <8 x i16> %a, %b
@@ -444,10 +434,9 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x
; SSE-LABEL: cmpgt_zext_v8i32:
; SSE: # BB#0:
; SSE-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1]
-; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psrld $31, %xmm0
; SSE-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: psrld $31, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: cmpgt_zext_v8i32:
@@ -463,8 +452,7 @@ define <8 x i32> @cmpgt_zext_v8i32(<8 x
; AVX2-LABEL: cmpgt_zext_v8i32:
; AVX2: # BB#0:
; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpsrld $31, %ymm0, %ymm0
; AVX2-NEXT: retq
;
%cmp = icmp sgt <8 x i32> %a, %b
@@ -492,13 +480,13 @@ define <2 x i64> @cmpgt_zext_v2i64(<2 x
; SSE42-LABEL: cmpgt_zext_v2i64:
; SSE42: # BB#0:
; SSE42-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE42-NEXT: psrlq $63, %xmm0
; SSE42-NEXT: retq
;
; AVX-LABEL: cmpgt_zext_v2i64:
; AVX: # BB#0:
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpsrlq $63, %xmm0, %xmm0
; AVX-NEXT: retq
;
%cmp = icmp sgt <2 x i64> %a, %b