[llvm] r284353 - [AVX-512] Add support for turning a 256-bit load that goes to both halfs of an insert_subvector into a subvector broadcast.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sun Oct 16 16:29:52 PDT 2016


Author: ctopper
Date: Sun Oct 16 18:29:51 2016
New Revision: 284353

URL: http://llvm.org/viewvc/llvm-project?rev=284353&view=rev
Log:
[AVX-512] Add support for turning a 256-bit load that goes to both halfs of an insert_subvector into a subvector broadcast.

Differential Revision: https://reviews.llvm.org/D25650

Modified:
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti256.ll
    llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=284353&r1=284352&r2=284353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Oct 16 18:29:51 2016
@@ -12982,17 +12982,32 @@ static SDValue LowerINSERT_SUBVECTOR(SDV
   MVT OpVT = Op.getSimpleValueType();
   MVT SubVecVT = SubVec.getSimpleValueType();
 
-  // Fold two 16-byte subvector loads into one 32-byte load:
-  // (insert_subvector (insert_subvector undef, (load addr), 0),
-  //                   (load addr + 16), Elts/2)
+  if (OpVT.getVectorElementType() == MVT::i1)
+    return insert1BitVector(Op, DAG, Subtarget);
+
+  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
+         "Can only insert into 256-bit or 512-bit vectors");
+
+  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+  // load:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr + 16), Elts/2)
   // --> load32 addr
-  // or a 16-byte broadcast:
-  // (insert_subvector (insert_subvector undef, (load addr), 0),
-  //                   (load addr), Elts/2)
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr + 32), Elts/2)
+  // --> load64 addr
+  // or a 16-byte or 32-byte broadcast:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr), Elts/2)
   // --> X86SubVBroadcast(load16 addr)
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr), Elts/2)
+  // --> X86SubVBroadcast(load32 addr)
   if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
       Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
-      OpVT.is256BitVector() && SubVecVT.is128BitVector()) {
+      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
     auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
     if (Idx2 && Idx2->getZExtValue() == 0) {
       SDValue SubVec2 = Vec.getOperand(1);
@@ -13020,12 +13035,6 @@ static SDValue LowerINSERT_SUBVECTOR(SDV
     }
   }
 
-  if (OpVT.getVectorElementType() == MVT::i1)
-    return insert1BitVector(Op, DAG, Subtarget);
-
-  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-         "Can only insert into 256-bit or 512-bit vectors");
-
   if (SubVecVT.is128BitVector())
     return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
 

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=284353&r1=284352&r2=284353&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Sun Oct 16 18:29:51 2016
@@ -1043,6 +1043,28 @@ defm VBROADCASTF64X4 : avx512_subvec_bro
                        v8f64_info, v4f64x_info>, VEX_W,
                        EVEX_V512, EVEX_CD8<64, CD8VT4>;
 
+let Predicates = [HasAVX512] in {
+def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v16i16 VR256X:$src), 1)>;
+def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v32i8 VR256X:$src), 1)>;
+}
+
 let Predicates = [HasVLX] in {
 defm VBROADCASTI32X4Z256 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti32x4",
                            v8i32x_info, v4i32x_info>,
@@ -1106,6 +1128,22 @@ def : Pat<(v4i64 (X86SubVBroadcast (v2i6
                               (v2i64 VR128X:$src), 1)>;
 }
 
+let Predicates = [HasAVX512, NoDQI] in {
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+          (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+          (VBROADCASTI64X4rm addr:$src)>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
+}
+
 let Predicates = [HasDQI] in {
 defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
                        v8i64_info, v2i64x_info>, VEX_W,
@@ -1119,6 +1157,15 @@ defm VBROADCASTF64X2 : avx512_subvec_bro
 defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
                        v16f32_info, v8f32x_info>,
                        EVEX_V512, EVEX_CD8<32, CD8VT8>;
+
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+          (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8f32 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+          (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+                           (v8i32 VR256X:$src), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,

Modified: llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti256.ll?rev=284353&r1=284352&r2=284353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vbroadcasti256.ll Sun Oct 16 18:29:51 2016
@@ -6,8 +6,7 @@
 define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    vaddpd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
@@ -19,8 +18,7 @@ define <8 x double> @test_broadcast_4f64
 define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovdqa64 (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
@@ -32,22 +30,19 @@ define <8 x i64> @test_broadcast_4i64_8i
 define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512VL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512VL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512VL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vmovaps (%rdi), %ymm0
-; X64-AVX512DQVL-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQVL-NEXT:    vaddps {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
@@ -59,22 +54,19 @@ define <16 x float> @test_broadcast_8f32
 define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512VL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512DQVL-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQVL-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQVL-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512DQVL-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
@@ -93,8 +85,7 @@ define <32 x i16> @test_broadcast_16i16_
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqu16 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddw {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
@@ -120,8 +111,7 @@ define <64 x i8> @test_broadcast_32i8_64
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vmovdqu8 (%rdi), %ymm0
-; X64-AVX512BWVL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BWVL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BWVL-NEXT:    vpaddb {{.*}}(%rip), %zmm0, %zmm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;

Modified: llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll?rev=284353&r1=284352&r2=284353&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll Sun Oct 16 18:29:51 2016
@@ -131,8 +131,7 @@ define <8 x double> @test_broadcast_4f64
 ; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4f64_8f64:
@@ -143,8 +142,7 @@ define <8 x double> @test_broadcast_4f64
 ;
 ; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -281,8 +279,7 @@ define <8 x i64> @test_broadcast_4i64_8i
 ; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vmovdqa64 (%eax), %ymm0
-; X32-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4i64_8i64:
@@ -293,8 +290,7 @@ define <8 x i64> @test_broadcast_4i64_8i
 ;
 ; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vmovdqa64 (%rdi), %ymm0
-; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -396,22 +392,19 @@ define <16 x float> @test_broadcast_8f32
 ; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovapd (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
-; X32-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8f32_16f32:
@@ -422,20 +415,17 @@ define <16 x float> @test_broadcast_8f32
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512F-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovapd (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
-; X64-AVX512DQ-NEXT:    vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -550,22 +540,19 @@ define <16 x i32> @test_broadcast_8i32_1
 ; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vmovdqa32 (%eax), %ymm0
-; X32-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8i32_16i32:
@@ -576,20 +563,17 @@ define <16 x i32> @test_broadcast_8i32_1
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vmovdqa32 (%rdi), %ymm0
-; X64-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -711,8 +695,7 @@ define <32 x i16> @test_broadcast_16i16_
 ; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqu16 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
@@ -736,8 +719,7 @@ define <32 x i16> @test_broadcast_16i16_
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqu16 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
@@ -865,8 +847,7 @@ define <64 x i8> @test_broadcast_32i8_64
 ; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vmovdqu8 (%eax), %ymm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
@@ -890,8 +871,7 @@ define <64 x i8> @test_broadcast_32i8_64
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vmovdqu8 (%rdi), %ymm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:




More information about the llvm-commits mailing list