[llvm] r258657 - AVX512: VMOVDQU8/16/32/64 (load) intrinsic implementation.
Igor Breger via llvm-commits
llvm-commits at lists.llvm.org
Sun Jan 24 00:04:34 PST 2016
Author: ibreger
Date: Sun Jan 24 02:04:33 2016
New Revision: 258657
URL: http://llvm.org/viewvc/llvm-project?rev=258657&view=rev
Log:
AVX512: VMOVDQU8/16/32/64 (load) intrinsic implementation.
Differential Revision: http://reviews.llvm.org/D16137
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsX86.td
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.h
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Sun Jan 24 02:04:33 2016
@@ -3028,12 +3028,58 @@ let TargetPrefix = "x86" in { // All in
def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_mask_loadu_d_512 : GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">,
- Intrinsic<[llvm_v16i32_ty], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_mask_loadu_q_512 : GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">,
- Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
- [IntrReadArgMem]>;
+
+ def int_x86_avx512_mask_loadu_b_128 :
+ GCCBuiltin<"__builtin_ia32_loaddquqi128_mask">,
+ Intrinsic<[llvm_v16i8_ty],
+ [llvm_ptr_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_b_256 :
+ GCCBuiltin<"__builtin_ia32_loaddquqi256_mask">,
+ Intrinsic<[llvm_v32i8_ty],
+ [llvm_ptr_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_b_512 :
+ GCCBuiltin<"__builtin_ia32_loaddquqi512_mask">,
+ Intrinsic<[llvm_v64i8_ty],
+ [llvm_ptr_ty, llvm_v64i8_ty, llvm_i64_ty], [IntrReadArgMem]>;
+
+ def int_x86_avx512_mask_loadu_w_128 :
+ GCCBuiltin<"__builtin_ia32_loaddquhi128_mask">,
+ Intrinsic<[llvm_v8i16_ty],
+ [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_w_256 :
+ GCCBuiltin<"__builtin_ia32_loaddquhi256_mask">,
+ Intrinsic<[llvm_v16i16_ty],
+ [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_w_512 :
+ GCCBuiltin<"__builtin_ia32_loaddquhi512_mask">,
+ Intrinsic<[llvm_v32i16_ty],
+ [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrReadArgMem]>;
+
+ def int_x86_avx512_mask_loadu_d_128 :
+ GCCBuiltin<"__builtin_ia32_loaddqusi128_mask">,
+ Intrinsic<[llvm_v4i32_ty],
+ [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_d_256 :
+ GCCBuiltin<"__builtin_ia32_loaddqusi256_mask">,
+ Intrinsic<[llvm_v8i32_ty],
+ [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_d_512 :
+ GCCBuiltin<"__builtin_ia32_loaddqusi512_mask">,
+ Intrinsic<[llvm_v16i32_ty],
+ [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrReadArgMem]>;
+
+ def int_x86_avx512_mask_loadu_q_128 :
+ GCCBuiltin<"__builtin_ia32_loaddqudi128_mask">,
+ Intrinsic<[llvm_v2i64_ty],
+ [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_q_256 :
+ GCCBuiltin<"__builtin_ia32_loaddqudi256_mask">,
+ Intrinsic<[llvm_v4i64_ty],
+ [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrReadArgMem]>;
+ def int_x86_avx512_mask_loadu_q_512 :
+ GCCBuiltin<"__builtin_ia32_loaddqudi512_mask">,
+ Intrinsic<[llvm_v8i64_ty],
+ [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrReadArgMem]>;
def int_x86_avx512_mask_load_d_128 :
GCCBuiltin<"__builtin_ia32_movdqa32load128_mask">,
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sun Jan 24 02:04:33 2016
@@ -20582,6 +20582,28 @@ SDValue X86TargetLowering::LowerOperatio
}
}
+/// Places new result values for the node in Results (their number
+/// and types must exactly match those of the original return values of
+/// the node), or leaves Results empty, which indicates that the node is not
+/// to be custom lowered after all.
+void X86TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ if (!Res.getNode())
+ return;
+
+ assert((N->getNumValues() <= Res->getNumValues()) &&
+ "Lowering returned the wrong number of results!");
+
+ // Place the new result values based on N's number of results.
+ // In some cases (LowerSINT_TO_FP, for example) Res has more result values
+ // than the original node; the chain (the last value) should be dropped.
+ for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.h?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h Sun Jan 24 02:04:33 2016
@@ -680,6 +680,14 @@ namespace llvm {
///
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+ /// Places new result values for the node in Results (their number
+ /// and types must exactly match those of the original return values of
+ /// the node), or leaves Results empty, which indicates that the node is not
+ /// to be custom lowered after all.
+ virtual void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const override;
+
/// Replace the results of node with an illegal result
/// type with new values built out of custom code.
///
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Sun Jan 24 02:04:33 2016
@@ -2752,14 +2752,6 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "v
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>;
-def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
- (v16i32 immAllZerosV), GR16:$mask)),
- (VMOVDQU32Zrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
-
-def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
- (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
- (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
-
let AddedComplexity = 20 in {
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
(v8i64 VR512:$src))),
Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Sun Jan 24 02:04:33 2016
@@ -156,12 +156,24 @@ static const IntrinsicData IntrinsicsWit
X86_INTRINSIC_DATA(avx512_mask_load_q_128, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_256, LOADA, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_load_q_512, LOADA, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_b_128, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_b_256, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_b_512, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_d_128, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_d_256, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_d_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_q_128, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_q_256, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_q_512, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_w_128, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_w_256, LOADU, ISD::DELETED_NODE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_loadu_w_512, LOADU, ISD::DELETED_NODE, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Sun Jan 24 02:04:33 2016
@@ -6603,6 +6603,42 @@ define <8 x i64>@test_int_x86_avx512_mas
ret <8 x i64> %res4
}
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8*, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_unaligned_d(i8* %ptr, i8* %ptr2, <16 x i32> %data, i16 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0
+; CHECK-NEXT: vmovdqu32 (%rsi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr2, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(i8* %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8*, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_unaligned_q(i8* %ptr, i8* %ptr2, <8 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0
+; CHECK-NEXT: vmovdqu64 (%rsi), %zmm0 {%k1}
+; CHECK-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr2, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(i8* %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i8, <16 x i32>, i16)
define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i8 %x1, <16 x i32> %x2, i16 %x3) {
Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll Sun Jan 24 02:04:33 2016
@@ -3087,6 +3087,66 @@ define <32 x i16>@test_int_x86_avx512_ma
ret <32 x i16> %res4
}
+declare <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8*, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_loadu_w_512(i8* %ptr, i8* %ptr2, <32 x i16> %x1, i32 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovd %edx, %k1
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqu16 (%rsi), %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_w_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm0
+; AVX512F-32-NEXT: vmovdqu16 (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: vmovdqu16 (%ecx), %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res0 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> %x1, i32 -1)
+ %res = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr2, <32 x i16> %res0, i32 %mask)
+ %res1 = call <32 x i16> @llvm.x86.avx512.mask.loadu.w.512(i8* %ptr, <32 x i16> zeroinitializer, i32 %mask)
+ %res2 = add <32 x i16> %res, %res1
+ ret <32 x i16> %res2
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8*, <64 x i8>, i64)
+
+define <64 x i8>@test_int_x86_avx512_mask_loadu_b_512(i8* %ptr, i8* %ptr2, <64 x i8> %x1, i64 %mask) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; AVX512BW: ## BB#0:
+; AVX512BW-NEXT: kmovq %rdx, %k1
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqu8 (%rsi), %zmm0 {%k1}
+; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_loadu_b_512:
+; AVX512F-32: # BB#0:
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT: vmovdqu8 (%eax), %zmm0 {%k1}
+; AVX512F-32-NEXT: vmovdqu8 (%ecx), %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT: retl
+ %res0 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> %x1, i64 -1)
+ %res = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr2, <64 x i8> %res0, i64 %mask)
+ %res1 = call <64 x i8> @llvm.x86.avx512.mask.loadu.b.512(i8* %ptr, <64 x i8> zeroinitializer, i64 %mask)
+ %res2 = add <64 x i8> %res, %res1
+ ret <64 x i8> %res2
+}
+
declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
@@ -3244,4 +3304,3 @@ define <64 x i8>@test_int_x86_avx512_mas
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}
-
Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll Sun Jan 24 02:04:33 2016
@@ -4947,6 +4947,78 @@ define <8 x i16>@test_int_x86_avx512_mas
ret <8 x i16> %res4
}
+declare <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8*, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x i16> %x1, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm0
+; CHECK-NEXT: vmovdqu16 (%rsi), %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> %x1, i8 -1)
+ %res = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr2, <8 x i16> %res0, i8 %mask)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.loadu.w.128(i8* %ptr, <8 x i16> zeroinitializer, i8 %mask)
+ %res2 = add <8 x i16> %res, %res1
+ ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8*, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16 x i16> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm0
+; CHECK-NEXT: vmovdqu16 (%rsi), %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu16 (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> %x1, i16 -1)
+ %res = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr2, <16 x i16> %res0, i16 %mask)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.loadu.w.256(i8* %ptr, <16 x i16> zeroinitializer, i16 %mask)
+ %res2 = add <16 x i16> %res, %res1
+ ret <16 x i16> %res2
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8*, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x i8> %x1, i16 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm0
+; CHECK-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu8 (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: retq
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> %x1, i16 -1)
+ %res = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr2, <16 x i8> %res0, i16 %mask)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.loadu.b.128(i8* %ptr, <16 x i8> zeroinitializer, i16 %mask)
+ %res2 = add <16 x i8> %res, %res1
+ ret <16 x i8> %res2
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8*, <32 x i8>, i32)
+
+define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x i8> %x1, i32 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovd %edx, %k1
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm0
+; CHECK-NEXT: vmovdqu8 (%rsi), %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu8 (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: retq
+ %res0 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> %x1, i32 -1)
+ %res = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr2, <32 x i8> %res0, i32 %mask)
+ %res1 = call <32 x i8> @llvm.x86.avx512.mask.loadu.b.256(i8* %ptr, <32 x i8> zeroinitializer, i32 %mask)
+ %res2 = add <32 x i8> %res, %res1
+ ret <32 x i8> %res2
+}
+
declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) {
Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=258657&r1=258656&r2=258657&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Sun Jan 24 02:04:33 2016
@@ -6672,6 +6672,79 @@ define <4 x i64>@test_int_x86_avx512_mas
%res4 = add <4 x i64> %res3, %res2
ret <4 x i64> %res4
}
+
+declare <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8*, <4 x i32>, i8)
+
+define <4 x i32> @test_mask_load_unaligned_d_128(i8* %ptr, i8* %ptr2, <4 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm0
+; CHECK-NEXT: vmovdqu32 (%rsi), %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu32 (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 -1)
+ %res1 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr2, <4 x i32> %res, i8 %mask)
+ %res2 = call <4 x i32> @llvm.x86.avx512.mask.loadu.d.128(i8* %ptr, <4 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i32> %res2, %res1
+ ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8*, <8 x i32>, i8)
+
+define <8 x i32> @test_mask_load_unaligned_d_256(i8* %ptr, i8* %ptr2, <8 x i32> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_d_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm0
+; CHECK-NEXT: vmovdqu32 (%rsi), %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu32 (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 -1)
+ %res1 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr2, <8 x i32> %res, i8 %mask)
+ %res2 = call <8 x i32> @llvm.x86.avx512.mask.loadu.d.256(i8* %ptr, <8 x i32> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i32> %res2, %res1
+ ret <8 x i32> %res4
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8*, <2 x i64>, i8)
+
+define <2 x i64> @test_mask_load_unaligned_q_128(i8* %ptr, i8* %ptr2, <2 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q_128:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm0
+; CHECK-NEXT: vmovdqu64 (%rsi), %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu64 (%rdi), %xmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 -1)
+ %res1 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr2, <2 x i64> %res, i8 %mask)
+ %res2 = call <2 x i64> @llvm.x86.avx512.mask.loadu.q.128(i8* %ptr, <2 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <2 x i64> %res2, %res1
+ ret <2 x i64> %res4
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8*, <4 x i64>, i8)
+
+define <4 x i64> @test_mask_load_unaligned_q_256(i8* %ptr, i8* %ptr2, <4 x i64> %data, i8 %mask) {
+; CHECK-LABEL: test_mask_load_unaligned_q_256:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kmovw %edx, %k1
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm0
+; CHECK-NEXT: vmovdqu64 (%rsi), %ymm0 {%k1}
+; CHECK-NEXT: vmovdqu64 (%rdi), %ymm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %res = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 -1)
+ %res1 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr2, <4 x i64> %res, i8 %mask)
+ %res2 = call <4 x i64> @llvm.x86.avx512.mask.loadu.q.256(i8* %ptr, <4 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <4 x i64> %res2, %res1
+ ret <4 x i64> %res4
+}
+
declare <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32>, i8, <4 x i32>, i8)
define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i8 %x1, <4 x i32> %x2, i8 %x3) {
More information about the llvm-commits
mailing list