[llvm] r292466 - [AVX-512] Use VSHUF instructions instead of two inserts as fallback for subvector broadcasts that can't fold the load.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 18 18:34:30 PST 2017
Author: ctopper
Date: Wed Jan 18 20:34:29 2017
New Revision: 292466
URL: http://llvm.org/viewvc/llvm-project?rev=292466&view=rev
Log:
[AVX-512] Use VSHUF instructions instead of two inserts as fallback for subvector broadcasts that can't fold the load.
Modified:
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=292466&r1=292465&r2=292466&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Wed Jan 18 20:34:29 2017
@@ -1092,46 +1092,6 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VINSERTF64x4Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-
-def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
}
let Predicates = [HasVLX] in {
@@ -1203,25 +1163,6 @@ def : Pat<(v8f64 (X86SubVBroadcast (load
def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
(VBROADCASTI32X4rm addr:$src)>;
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF64x4Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI64x4Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-
def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
(VBROADCASTF64X4rm addr:$src)>;
def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
@@ -1259,25 +1200,6 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f
def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
(VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v8i32 VR256X:$src), 1)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
- (VINSERTF32x8Zrr
- (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
- (VINSERTI32x8Zrr
- (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1),
- (EXTRACT_SUBREG
- (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
- VR128X:$src, sub_xmm),
- VR128X:$src, 1)), sub_ymm), 1)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
@@ -8471,6 +8393,39 @@ defm VSHUFI32X4 : avx512_shuff_packed_12
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+let Predicates = [HasAVX512] in {
+// Provide fallback in case the load node that is used in the broadcast
+// patterns above is used by additional users, which prevents the pattern
+// selection.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+ (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+ (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ 0)>;
+}
+
multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
AVX512AIi8Base, EVEX_4V;
Modified: llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll?rev=292466&r1=292465&r2=292466&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll (original)
+++ llvm/trunk/test/CodeGen/X86/subvector-broadcast.ll Wed Jan 18 20:34:29 2017
@@ -1225,8 +1225,7 @@ define <16 x i32> @test_broadcast_4i32_1
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
-; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1236,8 +1235,7 @@ define <16 x i32> @test_broadcast_4i32_1
; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
-; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1247,8 +1245,7 @@ define <16 x i32> @test_broadcast_4i32_1
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
-; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1265,8 +1262,7 @@ define <16 x i32> @test_broadcast_4i32_1
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
-; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X64-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1274,8 +1270,7 @@ define <16 x i32> @test_broadcast_4i32_1
; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
-; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1283,8 +1278,7 @@ define <16 x i32> @test_broadcast_4i32_1
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
-; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT: vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT: retq
%1 = load <4 x i32>, <4 x i32>* %p0
store <4 x float> zeroinitializer, <4 x float>* %p1
More information about the llvm-commits
mailing list