[PATCH] Teach the DAGCombiner how to fold vselect nodes with Mask of AllZeros/AllOnes.

Andrea Di Biagio andrea.dibiagio at gmail.com
Wed Jan 8 03:52:42 PST 2014


Hi,

This patch teaches the DAGCombiner how to fold 'vselect' DAG nodes
according to the following two rules:
  1)  fold (vselect (build_vector AllOnes), A, B) -> A
  2)  fold (vselect (build_vector AllZeros), A, B) -> B

These rules are useful, for example, when the vselect mask contains
undef values. They also improve the case where the entire mask is undef.
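
For instance, a vselect whose mask is <true, undef, true, undef> can
now be folded directly to its first operand, because
ISD::isBuildVectorAllOnes treats the undef elements as all-ones.
Test14 from the updated test/CodeGen/X86/vselect.ll (see the attached
patch) exercises exactly this case:

  define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
    %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
    ret <4 x float> %1
  }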

I added more test cases to test/CodeGen/X86/vselect.ll.

The following tests have been modified, since we now know how to
simplify vselect when the mask is undef:
  - test/CodeGen/X86/2011-10-19-widen_vselect.ll
  - test/CodeGen/X86/avx512-vselect-crash.ll

My patch also required a change to
test/CodeGen/AArch64/neon-bitwise-instructions.ll: before the fix, the
following DAG sequence

     (or (and A, (build_vector AllOnes)), (and B (build_vector AllZeros)))

was combined into a (vselect (build_vector AllOnes), A, B).
With my patch, the resulting vselect is folded into A and therefore
the backend no longer selects a 'bsl' for it.
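
To make the chain of combines explicit (this is a sketch of the DAG
transformations, not literal compiler output):

     (or (and A, (build_vector AllOnes)), (and B, (build_vector AllZeros)))
       --> (vselect (build_vector AllOnes), A, B)   ; existing or-of-ands combine
       --> A                                        ; new fold (rule 1 above)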

I changed the test so that the vselect produced when combining the 'or'
node above is not trivially foldable into one of its operands (i.e. its
mask is no longer a vector of all-ones). With that change, the test
passes again and still produces the expected 'bsl' instructions.
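
For example, bsl2xi32_const now uses an alternating mask (taken from
the attached patch):

     %tmp1 = and <2 x i32> %a, < i32 -1, i32 0 >
     %tmp2 = and <2 x i32> %b, < i32 0, i32 -1 >
     %tmp3 = or <2 x i32> %tmp1, %tmp2

The combined vselect mask is now neither all-ones nor all-zeros, so the
'bsl' is still selected.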

Please let me know if it is OK to submit.

Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group
-------------- next part --------------
Index: test/CodeGen/X86/vselect.ll
===================================================================
--- test/CodeGen/X86/vselect.ll	(revision 198746)
+++ test/CodeGen/X86/vselect.ll	(working copy)
@@ -130,4 +130,47 @@
 ; CHECK-NOT: psraw
 ; CHECK: ret
 
+; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
 
+define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+  %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %1
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: pcmpeq
+; CHECK: ret
+
+define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+  %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: pcmpeq
+; CHECK: ret
+
+; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
+
+define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+  %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
+  ret <4 x float> %1
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: ret
+
+define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+  %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
+  ret <8 x i16> %1
+}
+; CHECK-LABEL: test17
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: ret
+
Index: test/CodeGen/X86/avx512-vselect-crash.ll
===================================================================
--- test/CodeGen/X86/avx512-vselect-crash.ll	(revision 198746)
+++ test/CodeGen/X86/avx512-vselect-crash.ll	(working copy)
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
 ; CHECK-LABEL: test
-; CHECK: vmovdqu32
+; CHECK: vpxord
 ; CHECK: ret
 define <16 x i32> @test() {
 entry:
Index: test/CodeGen/X86/2011-10-19-widen_vselect.ll
===================================================================
--- test/CodeGen/X86/2011-10-19-widen_vselect.ll	(revision 198746)
+++ test/CodeGen/X86/2011-10-19-widen_vselect.ll	(working copy)
@@ -1,12 +1,10 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
 
-target triple = "x86_64-unknown-linux-gnu"
-
-; Make sure that we don't crash when legalizng vselect and vsetcc and that
+; Make sure that we don't crash when legalizing vselect and vsetcc and that
 ; we are able to generate vector blend instructions.
 
-; CHECK: simple_widen
-; CHECK: blend
+; CHECK-LABEL: simple_widen
+; CHECK-NOT: blend
 ; CHECK: ret
 define void @simple_widen() {
 entry:
@@ -15,7 +13,7 @@
   ret void
 }
 
-; CHECK: complex_inreg_work
+; CHECK-LABEL: complex_inreg_work
 ; CHECK: blend
 ; CHECK: ret
 
@@ -27,8 +25,8 @@
   ret void
 }
 
-; CHECK: zero_test
-; CHECK: blend
+; CHECK-LABEL: zero_test
+; CHECK: xorps	%xmm0, %xmm0
 ; CHECK: ret
 
 define void @zero_test() {
@@ -38,7 +36,7 @@
   ret void
 }
 
-; CHECK: full_test
+; CHECK-LABEL: full_test
 ; CHECK: blend
 ; CHECK: ret
 
Index: test/CodeGen/AArch64/neon-bitwise-instructions.ll
===================================================================
--- test/CodeGen/AArch64/neon-bitwise-instructions.ll	(revision 198746)
+++ test/CodeGen/AArch64/neon-bitwise-instructions.ll	(working copy)
@@ -40,16 +40,16 @@
 
 define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-	%tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
-	%tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+	%tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0 >
+	%tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 -1 >
 	%tmp3 = or <8 x i8> %tmp1, %tmp2
 	ret <8 x i8> %tmp3
 }
 
 define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
-;CHECK:  bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-	%tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
-	%tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+;CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+	%tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0 >
+	%tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 -1 >
 	%tmp3 = or <16 x i8> %tmp1, %tmp2
 	ret <16 x i8> %tmp3
 }
@@ -444,10 +444,11 @@
   %tmp2 = or <2 x i64> %a, %tmp1
   ret <2 x i64> %tmp2
 }
+
 define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-	%tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 >
-	%tmp2 = and <2 x i32> %b, < i32 0, i32 0 >
+	%tmp1 = and <2 x i32> %a, < i32 -1, i32 0 >
+	%tmp2 = and <2 x i32> %b, < i32 0, i32 -1 >
 	%tmp3 = or <2 x i32> %tmp1, %tmp2
 	ret <2 x i32> %tmp3
 }
@@ -455,40 +456,40 @@
 
 define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-	%tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1 >
-	%tmp2 = and <4 x i16> %b, < i16 0, i16 0,i16 0, i16 0 >
+	%tmp1 = and <4 x i16> %a, < i16 -1, i16 0, i16 -1,i16 0 >
+	%tmp2 = and <4 x i16> %b, < i16 0, i16 -1,i16 0, i16 -1 >
 	%tmp3 = or <4 x i16> %tmp1, %tmp2
 	ret <4 x i16> %tmp3
 }
 
 define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-	%tmp1 = and <1 x i64> %a, < i64 -1 >
-	%tmp2 = and <1 x i64> %b, < i64 0 >
+	%tmp1 = and <1 x i64> %a, < i64 -16 >
+	%tmp2 = and <1 x i64> %b, < i64 15 >
 	%tmp3 = or <1 x i64> %tmp1, %tmp2
 	ret <1 x i64> %tmp3
 }
 
 define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-	%tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 >
-	%tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 >
+	%tmp1 = and <4 x i32> %a, < i32 -1, i32 0, i32 -1, i32 0 >
+	%tmp2 = and <4 x i32> %b, < i32 0, i32 -1, i32 0, i32 -1 >
 	%tmp3 = or <4 x i32> %tmp1, %tmp2
 	ret <4 x i32> %tmp3
 }
 
 define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-	%tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1, i16 -1, i16 -1, i16 -1,i16 -1 >
-	%tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 >
+	%tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 0,i16 0, i16 -1, i16 -1, i16 0,i16 0 >
+	%tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 -1, i16 -1, i16 0, i16 0, i16 -1, i16 -1 >
 	%tmp3 = or <8 x i16> %tmp1, %tmp2
 	ret <8 x i16> %tmp3
 }
 
 define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b)  {
 ;CHECK:  bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-	%tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 >
-	%tmp2 = and <2 x i64> %b, < i64 0, i64 0 >
+	%tmp1 = and <2 x i64> %a, < i64 -1, i64 0 >
+	%tmp2 = and <2 x i64> %b, < i64 0, i64 -1 >
 	%tmp3 = or <2 x i64> %tmp1, %tmp2
 	ret <2 x i64> %tmp3
 }
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 198746)
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(working copy)
@@ -4398,6 +4398,13 @@
     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
   }
 
+  // Fold (vselect (build_vector all_ones), N1, N2) -> N1
+  if (ISD::isBuildVectorAllOnes(N0.getNode()))
+    return N1;
+  // Fold (vselect (build_vector all_zeros), N1, N2) -> N2
+  if (ISD::isBuildVectorAllZeros(N0.getNode()))
+    return N2;
+
   return SDValue();
 }
 

