[llvm] r204631 - [X86][ISelDAG] Add missing fallback patterns for avx2 broadcast instructions.

Mon Mar 24 10:54:20 PDT 2014

Author: qcolombet
Date: Mon Mar 24 12:54:19 2014
New Revision: 204631

URL: http://llvm.org/viewvc/llvm-project?rev=204631&view=rev
Log:
[X86][ISelDAG] Add missing fallback patterns for avx2 broadcast instructions.
Those patterns are used when the load cannot be folded into the related broadcast
during the select phase.
This happens when the load gets additional uses that were not anticipated during
the previous lowering phases (e.g., a constant vector is lowered to a constant
load, and that load is then reused) or when the selection DAG is not able to
prove that folding the load will not create a cycle in the DAG.

<rdar://problem/16074331>

Modified:
    llvm/trunk/lib/Target/X86/X86InstrSSE.td
    llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll

Modified: llvm/trunk/lib/Target/X86/X86InstrSSE.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrSSE.td?rev=204631&r1=204630&r2=204631&view=diff
==============================================================================

--- llvm/trunk/lib/Target/X86/X86InstrSSE.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrSSE.td Mon Mar 24 12:54:19 2014
@@ -8431,6 +8431,31 @@ let Predicates = [HasAVX2] in {
               (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
     def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
               (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+
+    def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
+          (VPBROADCASTBrr (COPY_TO_REGCLASS
+                           (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+                           VR128))>;
+    def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
+          (VPBROADCASTBYrr (COPY_TO_REGCLASS
+                            (i32 (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+                            VR128))>;
+
+    def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
+          (VPBROADCASTWrr (COPY_TO_REGCLASS
+                           (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+                           VR128))>;
+    def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
+          (VPBROADCASTWYrr (COPY_TO_REGCLASS
+                            (i32 (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit)),
+                            VR128))>;
+
+    // The patterns for VPBROADCASTD are not needed because they would match
+    // the exact same thing as VBROADCASTSS patterns.
+
+    def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
+          (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+    // The v4i64 pattern is not needed because VBROADCASTSDYrr already matches.
   }
 }
 

Modified: llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll?rev=204631&r1=204630&r2=204631&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx2-vbroadcast.ll Mon Mar 24 12:54:19 2014
@@ -413,3 +413,161 @@ define <4 x double> @splat_concat4(doubl
   %5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   ret <4 x double> %5
 }
+
+; Test cases for <rdar://problem/16074331>.
+; Instruction selection for broadcast instructions fails if
+; the load cannot be folded into the broadcast.
+; This happens if the load initially has one use but other uses are
+; created later, or if selection DAG cannot prove that folding the
+; load will not create a cycle in the DAG.
+; These test cases exercise the latter.
+
+; CHECK-LABEL: isel_crash_16b
+; CHECK: vpbroadcastb {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16b(i8* %cV_R.addr) {
+eintry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i8* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <16 x i8> undef, i8 %tmp2, i32 0
+  %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+  %tmp3 = bitcast <16 x i8> %splat.splat to <2 x i64>
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_32b
+; CHECK: vpbroadcastb {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_32b(i8* %cV_R.addr) {
+eintry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i8* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <32 x i8> undef, i8 %tmp2, i32 0
+  %splat.splat = shufflevector <32 x i8> %splat.splatinsert, <32 x i8> undef, <32 x i32> zeroinitializer
+  %tmp3 = bitcast <32 x i8> %splat.splat to <4 x i64>
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_8w
+; CHECK: vpbroadcastw {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8w(i16* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i16* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <8 x i16> undef, i16 %tmp2, i32 0
+  %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+  %tmp3 = bitcast <8 x i16> %splat.splat to <2 x i64>
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_16w
+; CHECK: vpbroadcastw {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16w(i16* %cV_R.addr) {
+eintry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i16* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <16 x i16> undef, i16 %tmp2, i32 0
+  %splat.splat = shufflevector <16 x i16> %splat.splatinsert, <16 x i16> undef, <16 x i32> zeroinitializer
+  %tmp3 = bitcast <16 x i16> %splat.splat to <4 x i64>
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_4d
+; CHECK: vbroadcastss {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4d(i32* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i32* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+  %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+  %tmp3 = bitcast <4 x i32> %splat.splat to <2 x i64>
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_8d
+; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8d(i32* %cV_R.addr) {
+eintry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i32* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <8 x i32> undef, i32 %tmp2, i32 0
+  %splat.splat = shufflevector <8 x i32> %splat.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+  %tmp3 = bitcast <8 x i32> %splat.splat to <4 x i64>
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_2q
+; CHECK: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_2q(i64* %cV_R.addr) {
+entry:
+  %__a.addr.i = alloca <2 x i64>, align 16
+  %__b.addr.i = alloca <2 x i64>, align 16
+  %vCr = alloca <2 x i64>, align 16
+  store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+  %tmp = load <2 x i64>* %vCr, align 16
+  %tmp2 = load i64* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %tmp2, i32 0
+  %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+  store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+  store <2 x i64> %splat.splat, <2 x i64>* %__b.addr.i, align 16
+  ret void
+}
+
+; CHECK-LABEL: isel_crash_4q
+; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4q(i64* %cV_R.addr) {
+eintry:
+  %__a.addr.i = alloca <4 x i64>, align 16
+  %__b.addr.i = alloca <4 x i64>, align 16
+  %vCr = alloca <4 x i64>, align 16
+  store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+  %tmp = load <4 x i64>* %vCr, align 16
+  %tmp2 = load i64* %cV_R.addr, align 4
+  %splat.splatinsert = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+  %splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+  store <4 x i64> %splat.splat, <4 x i64>* %__b.addr.i, align 16
+  ret void
+}