[PATCH] [X86, AVX] adjust tablegen patterns to generate better code for scalar insertion into zero vector (PR23073)

Sanjay Patel spatel at rotateright.com
Wed Apr 1 17:52:42 PDT 2015


Hi ab, qcolombet, craig.topper, andreadb,

As noted in PR23073 ( https://llvm.org/bugs/show_bug.cgi?id=23073 ),
for code like this:
  define <8 x i32> @load_v8i32() {
    ret <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  }

We produce this AVX code:
  _load_v8i32:                            ## @load_v8i32
	movl	$7, %eax
	vmovd	%eax, %xmm0
	vxorps	%ymm1, %ymm1, %ymm1
	vblendps	$1, %ymm0, %ymm1, %ymm0 ## ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
	retq


There are at least 2 bugs in play here:
1. We're generating a blend when a scalar move does the same job using 2 fewer instruction bytes.
2. We're not matching an existing pattern that would eliminate the xor and blend entirely; the zero bytes are free with vmovd. (See the sketch below.)
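
For reference, here's roughly what each fix should give us (illustrative
output, not generated by this patch as-is). With just the 1st fix, the ymm
blend becomes an xmm scalar move; the 128-bit VEX ops implicitly zero bits
255:128 of the ymm register, so no 256-bit op is needed:

	movl	$7, %eax
	vmovd	%eax, %xmm0
	vxorps	%xmm1, %xmm1, %xmm1
	vmovss	%xmm0, %xmm1, %xmm0     ## xmm0 = xmm0[0],xmm1[1,2,3]
	retq

With the 2nd fix, the xor and scalar move should disappear entirely, because
vmovd already zeros everything above the inserted element:

	movl	$7, %eax
	vmovd	%eax, %xmm0
	retq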

The 2nd fix involves an adjustment of "AddedComplexity" [1] and masks the 1st problem, but I went ahead with a partial fix for the 1st problem in case we ever match that pattern. I'm not sure how to trigger it, so I don't have an additional test case for it. I'll address the remaining FIXMEs if nobody sees any problems with this patch.

[1] AddedComplexity has close to no documentation in the source. The best we have is this comment: "roughly corresponds to the number of nodes that are covered". It appears that x86 has bastardized this definition by inflating its values for some other undocumented reason. For example, we have a pattern with "AddedComplexity = 400" (!). I searched my way to this page:
https://groups.google.com/forum/#!topic/llvm-dev/5UX-Og9M0xQ
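
For the curious, the mechanics look like this (illustrative value; the
pattern is one of the existing ones touched by this patch, and the comment
paraphrases the only in-source documentation mentioned above):

  // When several patterns can match the same DAG, the one with the higher
  // AddedComplexity wins; the value nominally reflects how many nodes the
  // pattern covers.
  let AddedComplexity = 20 in {
    def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
              (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  }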

http://reviews.llvm.org/D8794

Files:
  lib/Target/X86/X86InstrSSE.td
  test/CodeGen/X86/vector-shuffle-256-v4.ll
  test/CodeGen/X86/vector-shuffle-256-v8.ll

Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -7174,6 +7174,7 @@
   // MOVS{S,D} to the lower bits.
   def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
             (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
+  // FIXME: Prefer a movss (smaller encoding) over a blendps.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
             (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
   def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
@@ -7183,9 +7184,13 @@
 
   // Move low f32 and clear high bits.
   def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
-            (VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;
-  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
-            (VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i32 0), (VMOVSSrr (v4f32 (V_SET0)),
+              (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i64 0), (VMOVSDrr (v2f64 (V_SET0)),
+              (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
   }
 
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
@@ -7199,14 +7204,19 @@
                            (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                            sub_xmm)>;
 
-  // Move low f64 and clear high bits.
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
-
+  // These will incur an FP/int domain crossing penalty, but it may be the only
+  // way without AVX2. Do not add any complexity because we may be able to match
+  // more optimal patterns defined earlier in this file.
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSrr (v4f32 (V_SET0)),
+              (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
+            (SUBREG_TO_REG (i64 0), (VMOVSDrr (v2f64 (V_SET0)),
+              (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
 }
 
+// FIXME: Prefer a movss over a blendps and a movsd over a blendpd
+// (smaller encodings).
 let Predicates = [UseSSE41] in {
   // With SSE41 we can use blends for these patterns.
   def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
Index: test/CodeGen/X86/vector-shuffle-256-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -843,7 +843,7 @@
 define <4 x double> @insert_reg_and_zero_v4f64(double %a) {
 ; ALL-LABEL: insert_reg_and_zero_v4f64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; ALL:         vxorpd %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
 ; ALL-NEXT:    retq
   %v = insertelement <4 x double> undef, double %a, i32 0
Index: test/CodeGen/X86/vector-shuffle-256-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -133,8 +133,6 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -962,8 +960,6 @@
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    movl $7, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
-; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
-; AVX2-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
 ; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
