[PATCH][x86] Add more rules for combining vselect dag nodes.
Andrea Di Biagio
andrea.dibiagio at gmail.com
Tue Jan 21 14:38:58 PST 2014
Hi,
This patch adds extra rules for combining vselect dag nodes into movsd.
It improves on the fix committed at revision r199683 by adding the
following new target-specific combine rules (a scalar sketch of rule 1
follows the list):
1) fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
(v4i32 (bitcast (movsd (v2i64 (bitcast A)), (v2i64 (bitcast B))) ))
2) fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
(v4f32 (bitcast (movsd (v2f64 (bitcast A)), (v2f64 (bitcast B))) ))
3) fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
(v4i32 (bitcast (movsd (v2i64 (bitcast B)), (v2i64 (bitcast A))) ))
4) fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
(v4f32 (bitcast (movsd (v2f64 (bitcast B)), (v2f64 (bitcast A))) ))
Example:
//////
define <4 x i32> @test(<4 x i32> %A, <4 x i32> %B) {
  %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
ret <4 x i32> %select
}
//////
(with -mattr=sse2 and -march=x86-64)
Before this change, the compiler produced the following assembly:
movaps %xmm0, %xmm2
movaps .LCPI1_0(%rip), %xmm0
blendvps %xmm2, %xmm1
movaps %xmm1, %xmm0
retq
Now it produces:
movsd %xmm1, %xmm0
retq
In short, with this patch we generate a single movsd instead of a
blendvps, which requires a load from the constant pool to properly set
up the selection mask.
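The difference between the two lowerings can also be illustrated with
intrinsics (again just an illustration, not code from the patch; the
blendvps path assumes SSE4.1, while the movsd path only needs SSE2):
//////
#include <immintrin.h>

// Old lowering: blendvps needs the selection mask materialized in a register,
// which in practice means a load from the constant pool.
__m128 select_blendvps(__m128 A, __m128 B) {
  const __m128 Mask =
      _mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, 0)); // lanes 2,3 select A
  return _mm_blendv_ps(B, A, Mask); // sign bit set -> take from A
}

// New lowering: a single movsd merges the low 64 bits of B with the high
// 64 bits of A; no mask constant is required.
__m128 select_movsd(__m128 A, __m128 B) {
  __m128d Ad = _mm_castps_pd(A);
  __m128d Bd = _mm_castps_pd(B);
  return _mm_castpd_ps(_mm_move_sd(Ad, Bd)); // {B[0], B[1], A[2], A[3]}
}
//////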
Please let me know if this is OK to submit.
Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 199766)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -17187,6 +17187,45 @@
return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
}
+ if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
+ // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
+ // (v2i64 (bitcast B)))))
+ //
+ // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
+ // (v2f64 (bitcast B)))))
+ //
+ // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
+ // (v2i64 (bitcast A)))))
+ //
+ // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
+ // (v2f64 (bitcast A)))))
+
+ CanFold = (isZero(Cond.getOperand(0)) &&
+ isZero(Cond.getOperand(1)) &&
+ isAllOnes(Cond.getOperand(2)) &&
+ isAllOnes(Cond.getOperand(3)));
+
+ if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
+ isAllOnes(Cond.getOperand(1)) &&
+ isZero(Cond.getOperand(2)) &&
+ isZero(Cond.getOperand(3))) {
+ CanFold = true;
+ std::swap(LHS, RHS);
+ }
+
+ if (CanFold) {
+ EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
+ SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
+ SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
+ SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
+ NewB, DAG);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Select);
+ }
+ }
}
}
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll (revision 199766)
+++ test/CodeGen/X86/sse41-blend.ll (working copy)
@@ -13,7 +13,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
}
@@ -30,7 +30,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
Index: test/CodeGen/X86/vselect-2.ll
===================================================================
--- test/CodeGen/X86/vselect-2.ll (revision 0)
+++ test/CodeGen/X86/vselect-2.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test1
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test2
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test3
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test4
+; CHECK: movsd
+; CHECK-NEXT: ret
+