[PATCH][x86] Teach how to combine a vselect into a movss/movsd.
Andrea Di Biagio
andrea.dibiagio at gmail.com
Thu Jan 16 05:42:29 PST 2014
Hi,
this patch teaches the x86 backend how to combine vselect dag nodes
into movss/movsd when possible.
If the vector type of the operands of the vselect is either
MVT::v4i32 or MVT::v4f32, then we can fold according to the following rules:
1. fold (vselect (build_vector (0, -1, -1, -1)), A, B) -> (movss A, B);
2. fold (vselect (build_vector (-1, 0, 0, 0)), A, B) -> (movss B, A)
If the vector type of the operands of the vselect is either
MVT::v2i64 or MVT::v2f64 (and we have SSE2) , then we can fold
according to the following rules:
3. fold (vselect (build_vector (0, -1)), A, B) -> (movsd A, B)
4. fold (vselect (build_vector (-1, 0)), A, B) -> (movsd B, A)
I added extra test cases to file 'test/CodeGen/X86/vselect.ll' in
order to verify that we correctly select movss/movsd instructions.
Before this change, the backend only knew how to lower a shufflevector
into a X86Movss/X86Movsd, but not how to do the same with vselect dag
nodes.
For that reason, all the ISel patterns introduced at r197145
http://llvm.org/viewvc/llvm-project?view=revision&revision=197145
were only matched if the X86Movss/X86Movsd were obtained from the
custom lowering of a shufflevector.
With this change, the backend is now able to combine vselect into
X86Movss and therefore it can reuse the patterns from revision 197145
to further simplify packed vector arithmetic operations.
I added new test-cases in 'test/CodeGen/X86/sse-scalar-fp-arith-2.ll'
to verify that now we correctly select SSE/AVX scalar fp instructions
from a packed arithmetic instruction followed by a vselect.
After this change, the following tests started failing because they
always expected blendvps/blendvpd instructions in the output assembly:
test/CodeGen/X86/sse2-blend.ll
test/CodeGen/X86/avx-blend.ll
test/CodeGen/X86/blend-msb.ll
test/CodeGen/X86/sse41-blend.ll
Now the backend knows how to efficiently emit movss/movsd and
therefore all the failing cases are expected failures (that is because
the backend knows how to select movss/movsd and not only
blendvps/blendvpd).
I modified those failing tests so that - when possible - the generated
assembly still contains the expected blendvps/blendvpd(see for example
how I changed avx-blend.ll).
In all other cases I just changed the CHECK lines to verify that we
produce a movss/movsd.
Please let me know if ok to submit.
Thanks,
Andrea Di Biagio
SN Systems - Sony Computer Entertainment Group.
-------------- next part --------------
Index: test/CodeGen/X86/sse-scalar-fp-arith-2.ll
===================================================================
--- test/CodeGen/X86/sse-scalar-fp-arith-2.ll (revision 199379)
+++ test/CodeGen/X86/sse-scalar-fp-arith-2.ll (working copy)
@@ -213,3 +213,211 @@
; CHECK-NOT: movsd
; CHECK: ret
+
+define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
Index: test/CodeGen/X86/vselect.ll
===================================================================
--- test/CodeGen/X86/vselect.ll (revision 199379)
+++ test/CodeGen/X86/vselect.ll (working copy)
@@ -174,3 +174,91 @@
; CHECK-NOT: xorps
; CHECK: ret
+define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test18
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+ %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test19
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+ %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+ %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test21
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test22
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test23
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test24
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test25
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
Index: test/CodeGen/X86/avx-blend.ll
===================================================================
--- test/CodeGen/X86/avx-blend.ll (revision 199379)
+++ test/CodeGen/X86/avx-blend.ll (working copy)
@@ -6,7 +6,7 @@
;CHECK: vblendvps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
ret <4 x float> %vsel
}
@@ -15,13 +15,13 @@
;CHECK: vblendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
;CHECK-LABEL: vsel_double:
-;CHECK: vblendvpd
+;CHECK: vmovsd
;CHECK: ret
define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -30,7 +30,7 @@
;CHECK-LABEL: vsel_i64:
-;CHECK: vblendvpd
+;CHECK: vmovsd
;CHECK: ret
define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
Index: test/CodeGen/X86/sse2-blend.ll
===================================================================
--- test/CodeGen/X86/sse2-blend.ll (revision 199379)
+++ test/CodeGen/X86/sse2-blend.ll (working copy)
@@ -1,9 +1,9 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
-; CHECK: vsel_float
-; CHECK: xorps
+; CHECK-LABEL: vsel_float
+; CHECK-NOT: xorps
; CHECK: movss
-; CHECK: orps
+; CHECK-NOT: orps
; CHECK: ret
define void at vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
%A = load <4 x float>* %v1
@@ -13,10 +13,10 @@
ret void
}
-; CHECK: vsel_i32
-; CHECK: xorps
+; CHECK-LABEL: vsel_i32
+; CHECK-NOT: xorps
; CHECK: movss
-; CHECK: orps
+; CHECK-NOT: orps
; CHECK: ret
define void at vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
%A = load <4 x i32>* %v1
@@ -26,10 +26,9 @@
ret void
}
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK: vsel_i64
-; CHECK: andnps
-; CHECK: orps
+; CHECK-LABEL: vsel_i64
+; CHECK: movsd
+; CHECK-NOT: orps
; CHECK: ret
define void at vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {
@@ -40,10 +39,9 @@
ret void
}
-; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK: vsel_double
-; CHECK: andnps
-; CHECK: orps
+; CHECK-LABEL: vsel_double
+; CHECK: movsd
+; CHECK-NOT: orps
; CHECK: ret
define void at vsel_double(<2 x double>* %v1, <2 x double>* %v2) {
Index: test/CodeGen/X86/sse41-blend.ll
===================================================================
--- test/CodeGen/X86/sse41-blend.ll (revision 199379)
+++ test/CodeGen/X86/sse41-blend.ll (working copy)
@@ -4,7 +4,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
ret <4 x float> %vsel
}
@@ -13,7 +13,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
}
@@ -21,7 +21,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i16> %v1, <4 x i16> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
ret <4 x i16> %vsel
}
@@ -30,13 +30,13 @@
;CHECK: blendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
;CHECK-LABEL: vsel_double:
-;CHECK: blendvpd
+;CHECK: movsd
;CHECK: ret
define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -45,7 +45,7 @@
;CHECK-LABEL: vsel_i64:
-;CHECK: blendvpd
+;CHECK: movsd
;CHECK: ret
define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2
Index: test/CodeGen/X86/blend-msb.ll
===================================================================
--- test/CodeGen/X86/blend-msb.ll (revision 199379)
+++ test/CodeGen/X86/blend-msb.ll (working copy)
@@ -1,13 +1,11 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-; In this test we check that sign-extend of the mask bit is performed by
-; shifting the needed bit to the MSB, and not using shl+sra.
+; Verify that we produce movss instead of blendvps when possible.
;CHECK-LABEL: vsel_float:
-;CHECK: movl $-1
-;CHECK-NEXT: movd
-;CHECK-NEXT: blendvps
+;CHECK-NOT: blendvps
+;CHECK: movss
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -15,9 +13,8 @@
}
;CHECK-LABEL: vsel_4xi8:
-;CHECK: movl $-1
-;CHECK-NEXT: movd
-;CHECK-NEXT: blendvps
+;CHECK-NOT: blendvps
+;CHECK: movss
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp (revision 199379)
+++ lib/Target/X86/X86ISelLowering.cpp (working copy)
@@ -17155,6 +17155,53 @@
}
}
+ // Try to canonicalize this VSELECT into a MOVSS/MOVSD.
+ if (N->getOpcode() == ISD::VSELECT &&
+ Cond.getOpcode() == ISD::BUILD_VECTOR) {
+
+ if ((VT == MVT::v4i32 || VT == MVT::v4f32) ||
+ (Subtarget->hasSSE2() && (VT == MVT::v2i64 || VT == MVT::v2f64))) {
+ bool CanConvertToMOVSS = false;
+ bool InvertOperands = false;
+
+ // fold (vselect <0,-1,-1,-1>, A, B) -> (movss A, B)
+ // fold (vselect <0,-1>, A, B) -> (movsd A, B)
+ if (isZero(Cond.getOperand(0))) {
+ unsigned NumElems = VT.getVectorNumElements();
+ CanConvertToMOVSS = true;
+ for (unsigned i = 1, e = NumElems; i != e; ++i) {
+ SDValue Op = Cond.getOperand(i);
+ if (!isAllOnes(Op)) {
+ CanConvertToMOVSS = false;
+ break;
+ }
+ }
+ } else if (isAllOnes(Cond.getOperand(0))) {
+ // fold (vselect <-1,0,0,0>, A, B) -> (movss B, A)
+ // fold (vselect <-1,0>, A, B) -> (movsd B, A)
+ unsigned NumElems = VT.getVectorNumElements();
+ CanConvertToMOVSS = true;
+ InvertOperands = true;
+ for (unsigned i = 1, e = NumElems; i != e; ++i) {
+ SDValue Op = Cond.getOperand(i);
+ if (!isZero(Op)) {
+ CanConvertToMOVSS = false;
+ break;
+ }
+ }
+ }
+
+ if (CanConvertToMOVSS) {
+ SDValue NewLHS = InvertOperands ? RHS : LHS;
+ SDValue NewRHS = InvertOperands ? LHS : RHS;
+ if (VT == MVT::v4i32 || VT == MVT::v4f32)
+ return getTargetShuffleNode(X86ISD::MOVSS, DL, VT,
+ NewLHS, NewRHS, DAG);
+ return getTargetShuffleNode(X86ISD::MOVSD, DL, VT, NewLHS, NewRHS, DAG);
+ }
+ }
+ }
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
More information about the llvm-commits
mailing list