[llvm] r334378 - [DAGCombiner] match vector compare and select sizes with extload operand (PR37427)
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Sun Jun 10 16:09:50 PDT 2018
Author: spatel
Date: Sun Jun 10 16:09:50 2018
New Revision: 334378
URL: http://llvm.org/viewvc/llvm-project?rev=334378&view=rev
Log:
[DAGCombiner] match vector compare and select sizes with extload operand (PR37427)
This patch started off much more general and ambitious, but it's been a nightmare
seeing all the ways x86 vector codegen can go wrong.
So the code is still structured to allow extending easily, but it's currently
limited in several ways (a sketch of a qualifying pattern follows this list):
1. Only handle cases with an extending load.
2. Only handle cases with a zero constant compare.
3. Ignore setcc with vector bitmask (SetCCWidth != 1) - so AVX512 should be unaffected.
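To make these limits concrete, here is the comment's notation from the patch
instantiated with the first test's types (my sketch; the type annotations are my
reading of the new code, not text from the patch):
    vselect (ext (setcc load(X):v8i8, 0, eq)), N1:v8i32, N2:v8i32
      -->
    vselect (setcc zextload(X):v8i32, 0, eq), N1, N2
Here NarrowVT = v8i8 and WideVT = v8i32, the compare RHS is a zero splat, and the
'eq' predicate is not signed, so ZEXTLOAD/ZERO_EXTEND are chosen. As I read it, on
AVX2 getSetCCResultType(v8i8) has 8-bit elements (SetCCWidth == 8), while on AVX512
it would be v8i1 (SetCCWidth == 1), which is why limit 3 leaves AVX512 alone.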
The motivating case from PR37427:
https://bugs.llvm.org/show_bug.cgi?id=37427
...is the 1st test, and that shows the expected win - we eliminated the unnecessary
intermediate cast.
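For reference, that first test boils down to IR like this (a paraphrased sketch;
the function signature matches the test diff below, but the value names are mine):

    define <8 x i32> @eq_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) {
      %ld = load <8 x i8>, <8 x i8>* %p
      %cmp = icmp eq <8 x i8> %ld, zeroinitializer
      %sel = select <8 x i1> %cmp, <8 x i32> %x, <8 x i32> %y
      ret <8 x i32> %sel
    }

The eliminated intermediate cast is the widening of the v8i8 compare result up to
the select's width; with this patch, the load itself is zero-extended up front (the
vpmovzxbd in the new AVX2 output) and the compare runs at the select's width.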
There's a clear regression in the last test (sgt_zero_fp_select) because we no longer
recognize a 'SHRUNKBLEND' opportunity. I think that general problem is also present
in sgt_zero, so I'll try to fix that in a follow-up. We need to match a sign-bit
setcc from a sign-extended operand and remove it.
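To spell out the missed fold (a hedged sketch of my reading, not code from this
patch): the variable-blend instructions (vblendv*) only consume the sign bit of each
mask element, so a mask computed as
    setcc (sext X), 0, slt
has the same sign bits as (sext X) itself, and the compare against zero is removable
when its only user is the blend. The FIXME added before slt_zero_fp_select below
tracks the cleanest instance of this.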
Differential Revision: https://reviews.llvm.org/D47330
Modified:
llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll
Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=334378&r1=334377&r2=334378&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Sun Jun 10 16:09:50 2018
@@ -7331,6 +7331,36 @@ SDValue DAGCombiner::visitVSELECT(SDNode
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
}
+
+ // If this select has a condition (setcc) with narrower operands than the
+ // select, try to widen the compare to match the select width.
+ // TODO: This should be extended to handle any constant.
+ // TODO: This could be extended to handle non-loading patterns, but that
+ // requires thorough testing to avoid regressions.
+ if (isNullConstantOrNullSplatConstant(RHS)) {
+ EVT NarrowVT = LHS.getValueType();
+ EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
+ EVT SetCCVT = getSetCCResultType(LHS.getValueType());
+ unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
+ unsigned WideWidth = WideVT.getScalarSizeInBits();
+ bool IsSigned = isSignedIntSetCC(CC);
+ auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+ if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
+ SetCCWidth != 1 && SetCCWidth < WideWidth &&
+ TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
+ // Both compare operands can be widened for free. The LHS can use an
+ // extended load, and the RHS is a constant:
+ // vselect (ext (setcc load(X), C)), N1, N2 -->
+ // vselect (setcc extload(X), C'), N1, N2
+ auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
+ SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
+ EVT WideSetCCVT = getSetCCResultType(WideVT);
+ SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
+ return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
+ }
+ }
}
if (SimplifySelectOps(N, N1, N2))
Modified: llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll?rev=334378&r1=334377&r2=334378&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vsel-cmp-load.ll Sun Jun 10 16:09:50 2018
@@ -21,11 +21,9 @@ define <8 x i32> @eq_zero(<8 x i8>* %p,
;
; AVX2-LABEL: eq_zero:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpslld $24, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -58,12 +56,11 @@ define <4 x i64> @ne_zero(<4 x i16>* %p,
;
; AVX2-LABEL: ne_zero:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -96,10 +93,9 @@ define <16 x i16> @sgt_zero(<16 x i8>* %
;
; AVX2-LABEL: sgt_zero:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-NEXT: vpmovsxbw (%rdi), %ymm2
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
+; AVX2-NEXT: vpcmpgtw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -119,23 +115,21 @@ define <8 x i32> @slt_zero(<8 x i8>* %p,
; AVX1-LABEL: slt_zero:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbw (%rdi), %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX1-NEXT: vpslld $24, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX1-NEXT: vpslld $24, %xmm2, %xmm2
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: slt_zero:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw (%rdi), %xmm2
+; AVX2-NEXT: vpmovsxbd (%rdi), %ymm2
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpslld $24, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -166,10 +160,9 @@ define <4 x double> @eq_zero_fp_select(<
;
; AVX2-LABEL: eq_zero_fp_select:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vpcmpeqq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -203,13 +196,11 @@ define <8 x float> @ne_zero_fp_select(<8
;
; AVX2-LABEL: ne_zero_fp_select:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpeqw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpslld $24, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -229,21 +220,21 @@ define <4 x double> @sgt_zero_fp_select(
; AVX1-LABEL: sgt_zero_fp_select:
; AVX1: # %bb.0:
; AVX1-NEXT: vpmovsxbd (%rdi), %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sgt_zero_fp_select:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbd (%rdi), %xmm2
+; AVX2-NEXT: vpmovsxbq (%rdi), %ymm2
; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -260,6 +251,8 @@ define <4 x double> @sgt_zero_fp_select(
ret <4 x double> %sel
}
+; FIXME: The compare with 0 for AVX2 should be eliminated.
+
define <8 x float> @slt_zero_fp_select(<8 x i16>* %p, <8 x float> %x, <8 x float> %y) {
; AVX1-LABEL: slt_zero_fp_select:
; AVX1: # %bb.0:
@@ -272,6 +265,8 @@ define <8 x float> @slt_zero_fp_select(<
; AVX2-LABEL: slt_zero_fp_select:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwd (%rdi), %ymm2
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;