[llvm] r333346 - [X86] Remove masking from avx512ifma intrinsics. Use a select instead.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sat May 26 11:55:19 PDT 2018
Author: ctopper
Date: Sat May 26 11:55:19 2018
New Revision: 333346
URL: http://llvm.org/viewvc/llvm-project?rev=333346&view=rev
Log:
[X86] Remove masking from avx512ifma intrinsics. Use a select instead.
This allows us to avoid having mask and maskz variant. Reducing from 12 intrinsics to 6.
Added:
llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
- copied, changed from r333344, llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll
llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll
- copied, changed from r333344, llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsX86.td
llvm/trunk/lib/IR/AutoUpgrade.cpp
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll
llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Sat May 26 11:55:19 2018
@@ -2657,54 +2657,30 @@ let TargetPrefix = "x86" in { // All in
[llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
llvm_i32_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq128_mask">,
+ def int_x86_avx512_vpmadd52h_uq_128 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq128_maskz">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_128 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq128_mask">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_128 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq128_maskz">,
- Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
- llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq256_mask">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq256_maskz">,
+ llvm_v2i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52h_uq_256 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq256_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_256 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_256 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq256_maskz">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
- llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52h_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq512_mask">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52h_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52huq512_maskz">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_mask_vpmadd52l_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq512_mask">,
+ llvm_v4i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52h_uq_512 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52huq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx512_maskz_vpmadd52l_uq_512 :
- GCCBuiltin<"__builtin_ia32_vpmadd52luq512_maskz">,
+ llvm_v8i64_ty], [IntrNoMem]>;
+ def int_x86_avx512_vpmadd52l_uq_512 :
+ GCCBuiltin<"__builtin_ia32_vpmadd52luq512">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+ llvm_v8i64_ty], [IntrNoMem]>;
}
// VNNI
Modified: llvm/trunk/lib/IR/AutoUpgrade.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/IR/AutoUpgrade.cpp?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp (original)
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp Sat May 26 11:55:19 2018
@@ -265,6 +265,8 @@ static bool ShouldUpgradeX86Intrinsic(Fu
Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
Name.startswith("avx512.mask.pternlog.") || // Added in 7.0
Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0
+ Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0
+ Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0
Name == "sse.cvtsi2ss" || // Added in 7.0
Name == "sse.cvtsi642ss" || // Added in 7.0
Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2569,6 +2571,34 @@ void llvm::UpgradeIntrinsicCall(CallInst
Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
: CI->getArgOperand(0);
Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
+ } else if (IsX86 && (Name.startswith("avx512.mask.vpmadd52") ||
+ Name.startswith("avx512.maskz.vpmadd52"))) {
+ bool ZeroMask = Name[11] == 'z';
+ bool High = Name[20] == 'h' || Name[21] == 'h';
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_128;
+ else if (VecWidth == 256 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_256;
+ else if (VecWidth == 512 && !High)
+ IID = Intrinsic::x86_avx512_vpmadd52l_uq_512;
+ else if (VecWidth == 128 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_128;
+ else if (VecWidth == 256 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_256;
+ else if (VecWidth == 512 && High)
+ IID = Intrinsic::x86_avx512_vpmadd52h_uq_512;
+ else
+ llvm_unreachable("Unexpected intrinsic");
+
+ Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+ CI->getArgOperand(2) };
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+ Args);
+ Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+ : CI->getArgOperand(0);
+ Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
} else if (IsX86 && Name.startswith("avx512.mask.") &&
upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
// Rep will be updated by the call in the condition.
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat May 26 11:55:19 2018
@@ -20624,26 +20624,11 @@ SDValue X86TargetLowering::LowerINTRINSI
Src3),
Mask, PassThru, Subtarget, DAG);
}
- case IFMA_OP_MASKZ:
- case IFMA_OP_MASK: {
- SDValue Src1 = Op.getOperand(1);
- SDValue Src2 = Op.getOperand(2);
- SDValue Src3 = Op.getOperand(3);
- SDValue Mask = Op.getOperand(4);
- MVT VT = Op.getSimpleValueType();
- SDValue PassThru = Src1;
-
- // set PassThru element
- if (IntrData->Type == IFMA_OP_MASKZ)
- PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
- // Node we need to swizzle the operands to pass the multiply operands
+ case IFMA_OP:
+ // NOTE: We need to swizzle the operands to pass the multiply operands
// first.
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
- dl, Op.getValueType(),
- Src2, Src3, Src1),
- Mask, PassThru, Subtarget, DAG);
- }
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
case CVTPD2PS:
// ISD::FP_ROUND has a second argument that indicates if the truncation
// does not change the value. Set it to 0 since it can change.
Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Sat May 26 11:55:19 2018
@@ -30,7 +30,7 @@ enum IntrinsicType : uint16_t {
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
- IFMA_OP_MASK, IFMA_OP_MASKZ,
+ IFMA_OP,
VPERM_2OP, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
@@ -1133,18 +1133,6 @@ static const IntrinsicData IntrinsicsWi
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
- X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
@@ -1325,18 +1313,6 @@ static const IntrinsicData IntrinsicsWi
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
- X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
@@ -1461,6 +1437,12 @@ static const IntrinsicData IntrinsicsWi
X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
+ X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
Added: llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll?rev=333346&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll Sat May 26 11:55:19 2018
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifma-builtins.c
+
+define <8 x i64> @test_mm512_madd52hi_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ ret <8 x i64> %0
+}
+
+define <8 x i64> @test_mm512_mask_madd52hi_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X32-LABEL: test_mm512_mask_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_madd52hi_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_madd52lo_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ ret <8 x i64> %0
+}
+
+define <8 x i64> @test_mm512_mask_madd52lo_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X32-LABEL: test_mm512_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_madd52lo_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+ ret <8 x i64> %2
+}
+
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
Copied: llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll (from r333344, llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll?p2=llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll&p1=llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll&r1=333344&r2=333346&rev=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll Sat May 26 11:55:19 2018
@@ -6,9 +6,9 @@ declare <8 x i64> @llvm.x86.avx512.mask.
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -34,9 +34,9 @@ declare <8 x i64> @llvm.x86.avx512.maskz
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -62,9 +62,9 @@ declare <8 x i64> @llvm.x86.avx512.mask.
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -90,9 +90,9 @@ declare <8 x i64> @llvm.x86.avx512.maskz
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
Modified: llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll Sat May 26 11:55:19 2018
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
-declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,28 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
-declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +51,30 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
-declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +85,28 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
-declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4
; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -103,12 +117,18 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- %res4 = add <8 x i64> %res, %res1
- %res5 = add <8 x i64> %res3, %res2
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
+ %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+ %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %res4 = add <8 x i64> %3, %6
+ %res5 = add <8 x i64> %10, %9
%res6 = add <8 x i64> %res5, %res4
ret <8 x i64> %res6
}
@@ -120,8 +140,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
; CHECK-NEXT: retq
%x2 = load <8 x i64>, <8 x i64>* %x2ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) {
@@ -133,8 +153,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
%x2load = load i64, i64* %x2ptr
%x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
%x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) {
@@ -144,8 +164,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
; CHECK-NEXT: retq
%x1 = load <8 x i64>, <8 x i64>* %x1ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
@@ -157,8 +177,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
%x1load = load i64, i64* %x1ptr
%x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
%x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ ret <8 x i64> %1
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
@@ -169,8 +189,10 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: retq
%x2 = load <8 x i64>, <8 x i64>* %x2ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
@@ -183,8 +205,10 @@ define <8 x i64>@test_int_x86_avx512_mas
%x2load = load i64, i64* %x2ptr
%x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
%x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -195,8 +219,10 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: retq
%x1 = load <8 x i64>, <8 x i64>* %x1ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -209,8 +235,10 @@ define <8 x i64>@test_int_x86_avx512_mas
%x1load = load i64, i64* %x1ptr
%x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
%x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
@@ -221,8 +249,10 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: retq
%x2 = load <8 x i64>, <8 x i64>* %x2ptr
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
@@ -235,8 +265,10 @@ define <8 x i64>@test_int_x86_avx512_mas
%x2load = load i64, i64* %x2ptr
%x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
%x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -247,8 +279,10 @@ define <8 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: retq
%x1 = load <8 x i64>, <8 x i64>* %x1ptr
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -261,6 +295,8 @@ define <8 x i64>@test_int_x86_avx512_mas
%x1load = load i64, i64* %x1ptr
%x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
%x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
+ %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
}
Added: llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll?rev=333346&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll Sat May 26 11:55:19 2018
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifmavl-builtins.c
+
+define <2 x i64> @test_mm_madd52hi_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52hi_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52hi_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52hi_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52hi_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52hi_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+ ret <4 x i64> %2
+}
+
+define <2 x i64> @test_mm_madd52lo_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52lo_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52lo_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52lo_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52lo_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52lo_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+ %1 = bitcast i8 %__M to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+ ret <4 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
Copied: llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll (from r333344, llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll?p2=llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll&p1=llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll&r1=333344&r2=333346&rev=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll Sat May 26 11:55:19 2018
@@ -6,9 +6,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -34,9 +34,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -62,9 +62,9 @@ declare <2 x i64> @llvm.x86.avx512.maskz
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -90,9 +90,9 @@ declare <4 x i64> @llvm.x86.avx512.maskz
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -118,9 +118,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -146,9 +146,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -174,9 +174,9 @@ declare <2 x i64> @llvm.x86.avx512.maskz
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -202,9 +202,9 @@ declare <4 x i64> @llvm.x86.avx512.maskz
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
Modified: llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll Sat May 26 11:55:19 2018
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,33 @@ define <2 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +56,31 @@ define <4 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +91,31 @@ define <2 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -103,24 +126,33 @@ define <4 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -131,24 +163,33 @@ define <2 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -159,24 +200,31 @@ define <4 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm3
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %xmm0, %xmm4
; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -187,24 +235,31 @@ define <2 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retq
- %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
- %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
- %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
- %res4 = add <2 x i64> %res, %res1
- %res5 = add <2 x i64> %res3, %res2
+ %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+ %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+ %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+ %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+ %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+ %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+ %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+ %res4 = add <2 x i64> %3, %6
+ %res5 = add <2 x i64> %10, %9
%res6 = add <2 x i64> %res5, %res4
ret <2 x i64> %res6
}
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm3
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmovdqa %ymm0, %ymm4
; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -215,12 +270,21 @@ define <4 x i64>@test_int_x86_avx512_mas
; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; CHECK-NEXT: retq
- %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
- %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
- %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
- %res4 = add <4 x i64> %res, %res1
- %res5 = add <4 x i64> %res3, %res2
+ %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %2 = bitcast i8 %x3 to <8 x i1>
+ %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+ %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %5 = bitcast i8 %x3 to <8 x i1>
+ %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+ %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+ %8 = bitcast i8 %x3 to <8 x i1>
+ %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+ %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+ %res4 = add <4 x i64> %3, %6
+ %res5 = add <4 x i64> %10, %9
%res6 = add <4 x i64> %res5, %res4
ret <4 x i64> %res6
}
More information about the llvm-commits
mailing list