[PATCH][x86] Add more rules for combining vselect dag nodes.
Andrea Di Biagio
andrea.dibiagio at gmail.com
Tue Jan 21 14:38:58 PST 2014
Hi,
This patch adds extra rules for combining vselect dag nodes into movsd.
It improves on the fix committed at revision r199683 by adding the
following new target-specific combine rules (a scalar sketch of rule 1
follows the list):
1) fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
(v4i32 (bitcast (movsd (v2i64 (bitcast A)), (v2i64 (bitcast B))) ))
2) fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
(v4f32 (bitcast (movsd (v2f64 (bitcast A)), (v2f64 (bitcast B))) ))
3) fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
(v4i32 (bitcast (movsd (v2i64 (bitcast B)), (v2i64 (bitcast A))) ))
4) fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
(v4f32 (bitcast (movsd (v2f64 (bitcast B)), (v2f64 (bitcast A))) ))
Example:
//////
define <4 x i32> @test(<4 x i32> %A, <4 x i32> %B) {
  %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
ret <4 x i32> %select
}
//////
(with -mattr=sse2 and -march=x86-64)
Before this change, the compiler produced the following assembly:
movaps %xmm0, %xmm2
movaps .LCPI1_0(%rip), %xmm0
blendvps %xmm2, %xmm1
movaps %xmm1, %xmm0
retq
Now it produces:
movsd %xmm1, %xmm0
retq
In short, with this patch we generate a single movsd instead of a
blendvps, which requires a load from the constant pool to properly set
up the selection mask.
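The difference between the two lowerings can also be illustrated with
intrinsics (again just an illustration, not code from the patch; the
blendvps path assumes SSE4.1, while the movsd path only needs SSE2):
//////
#include <immintrin.h>

// Old lowering: blendvps needs the selection mask materialized in a register,
// which in practice means a load from the constant pool.
__m128 select_blendvps(__m128 A, __m128 B) {
  const __m128 Mask =
      _mm_castsi128_ps(_mm_set_epi32(-1, -1, 0, 0)); // lanes 2,3 select A
  return _mm_blendv_ps(B, A, Mask); // sign bit set -> take from A
}

// New lowering: a single movsd merges the low 64 bits of B with the high
// 64 bits of A; no mask constant is required.
__m128 select_movsd(__m128 A, __m128 B) {
  __m128d Ad = _mm_castps_pd(A);
  __m128d Bd = _mm_castps_pd(B);
  return _mm_castpd_ps(_mm_move_sd(Ad, Bd)); // {B[0], B[1], A[2], A[3]}
}
//////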
Please let me know if this is OK to submit.
Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 199766)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -17187,6 +17187,45 @@
return getTargetShuffleNode(X86ISD::MOVSS, DL, VT, A, B, DAG);
return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, A, B, DAG);
}
+ if (Subtarget->hasSSE2() && (VT == MVT::v4i32 || VT == MVT::v4f32)) {
+ // fold (v4i32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast A)),
+ // (v2i64 (bitcast B)))))
+ //
+ // fold (v4f32: vselect <0,0,-1,-1>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast A)),
+ // (v2f64 (bitcast B)))))
+ //
+ // fold (v4i32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4i32 (bitcast (movsd (v2i64 (bitcast B)),
+ // (v2i64 (bitcast A)))))
+ //
+ // fold (v4f32: vselect <-1,-1,0,0>, A, B) ->
+ // (v4f32 (bitcast (movsd (v2f64 (bitcast B)),
+ // (v2f64 (bitcast A)))))
+
+ CanFold = (isZero(Cond.getOperand(0)) &&
+ isZero(Cond.getOperand(1)) &&
+ isAllOnes(Cond.getOperand(2)) &&
+ isAllOnes(Cond.getOperand(3)));
+
+ if (!CanFold && isAllOnes(Cond.getOperand(0)) &&
+ isAllOnes(Cond.getOperand(1)) &&
+ isZero(Cond.getOperand(2)) &&
+ isZero(Cond.getOperand(3))) {
+ CanFold = true;
+ std::swap(LHS, RHS);
+ }
+
+ if (CanFold) {
+ EVT NVT = (VT == MVT::v4i32) ? MVT::v2i64 : MVT::v2f64;
+ SDValue NewA = DAG.getNode(ISD::BITCAST, DL, NVT, LHS);
+ SDValue NewB = DAG.getNode(ISD::BITCAST, DL, NVT, RHS);
+ SDValue Select = getTargetShuffleNode(X86ISD::MOVSD, DL, NVT, NewA,
+ NewB, DAG);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Select);
+ }
+ }
}
}
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll (revision 199766)
+++ test/CodeGen/X86/sse41-blend.ll (working copy)
@@ -13,7 +13,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
}
@@ -30,7 +30,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
Index: test/CodeGen/X86/vselect-2.ll
===================================================================
--- test/CodeGen/X86/vselect-2.ll (revision 0)
+++ test/CodeGen/X86/vselect-2.ll (revision 0)
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test1
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test2
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test3
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test4
+; CHECK: movsd
+; CHECK-NEXT: ret
+