[llvm] r219316 - [AVX512] Added intrinsics for 128-, 256- and 512-bit versions of VPCMP/VPCMPU{BWDQ}
Adam Nemet
anemet at apple.com
Wed Oct 8 12:54:36 PDT 2014
On Oct 8, 2014, at 10:49 AM, Adam Nemet <anemet at apple.com> wrote:
>
> On Oct 8, 2014, at 8:49 AM, Robert Khasanov <rob.khasanov at gmail.com> wrote:
>
>> Author: rkhasanov
>> Date: Wed Oct 8 10:49:26 2014
>> New Revision: 219316
>>
>> URL: http://llvm.org/viewvc/llvm-project?rev=219316&view=rev
>> Log:
>> [AVX512] Added intrinsics for 128-, 256- and 512-bit versions of VPCMP/VPCMPU{BWDQ}
>> Added CMP_MASK_CC intrinsic type.
>> Added tests for intrinsics.
>>
>> Patch by Sergey Lisitsyn <sergey.lisitsyn at intel.com>
>>
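For illustration, a call to one of the new intrinsics at the IR level looks like this (signature as declared in the tests below; %a, %b and %mask are placeholders, the i32 immediate selects the comparison predicate, and the trailing integer operand is the write-mask):

    %res = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a, <16 x i32> %b,
                                                    i32 0, i16 %mask) ; predicate 0 = eq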
>> Modified:
>> llvm/trunk/include/llvm/IR/IntrinsicsX86.td
>> llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
>> llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
>> llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
>> llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
>> llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
>>
>> Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
>> +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Wed Oct 8 10:49:26 2014
>> @@ -3263,6 +3263,32 @@ let TargetPrefix = "x86" in {
>> Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
>> [IntrNoMem]>;
>>
>> + def int_x86_avx512_mask_cmp_b_512: GCCBuiltin<"__builtin_ia32_cmpb512_mask">,
>> + Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
>> + llvm_i64_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_w_512: GCCBuiltin<"__builtin_ia32_cmpw512_mask">,
>> + Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
>> + llvm_i32_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_d_512: GCCBuiltin<"__builtin_ia32_cmpd512_mask">,
>> + Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
>> + llvm_i16_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_q_512: GCCBuiltin<"__builtin_ia32_cmpq512_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> +
>> + def int_x86_avx512_mask_ucmp_b_512: GCCBuiltin<"__builtin_ia32_ucmpb512_mask">,
>> + Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
>> + llvm_i64_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_w_512: GCCBuiltin<"__builtin_ia32_ucmpw512_mask">,
>> + Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
>> + llvm_i32_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_d_512: GCCBuiltin<"__builtin_ia32_ucmpd512_mask">,
>> + Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
>> + llvm_i16_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_q_512: GCCBuiltin<"__builtin_ia32_ucmpq512_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> +
>> // 256-bit
>> def int_x86_avx512_mask_pcmpeq_b_256 : GCCBuiltin<"__builtin_ia32_pcmpeqb256_mask">,
>> Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
>> @@ -3290,6 +3316,32 @@ let TargetPrefix = "x86" in {
>> Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
>> [IntrNoMem]>;
>>
>> + def int_x86_avx512_mask_cmp_b_256: GCCBuiltin<"__builtin_ia32_cmpb256_mask">,
>> + Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
>> + llvm_i32_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_w_256: GCCBuiltin<"__builtin_ia32_cmpw256_mask">,
>> + Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
>> + llvm_i16_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_d_256: GCCBuiltin<"__builtin_ia32_cmpd256_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_q_256: GCCBuiltin<"__builtin_ia32_cmpq256_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> +
>> + def int_x86_avx512_mask_ucmp_b_256: GCCBuiltin<"__builtin_ia32_ucmpb256_mask">,
>> + Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
>> + llvm_i32_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_w_256: GCCBuiltin<"__builtin_ia32_ucmpw256_mask">,
>> + Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
>> + llvm_i16_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_d_256: GCCBuiltin<"__builtin_ia32_ucmpd256_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_q_256: GCCBuiltin<"__builtin_ia32_ucmpq256_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> +
>> // 128-bit
>> def int_x86_avx512_mask_pcmpeq_b_128 : GCCBuiltin<"__builtin_ia32_pcmpeqb128_mask">,
>> Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
>> @@ -3316,6 +3368,32 @@ let TargetPrefix = "x86" in {
>> def int_x86_avx512_mask_pcmpgt_q_128: GCCBuiltin<"__builtin_ia32_pcmpgtq128_mask">,
>> Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
>> [IntrNoMem]>;
>> +
>> + def int_x86_avx512_mask_cmp_b_128: GCCBuiltin<"__builtin_ia32_cmpb128_mask">,
>> + Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
>> + llvm_i16_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_w_128: GCCBuiltin<"__builtin_ia32_cmpw128_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_d_128: GCCBuiltin<"__builtin_ia32_cmpd128_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_cmp_q_128: GCCBuiltin<"__builtin_ia32_cmpq128_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> +
>> + def int_x86_avx512_mask_ucmp_b_128: GCCBuiltin<"__builtin_ia32_ucmpb128_mask">,
>> + Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
>> + llvm_i16_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_w_128: GCCBuiltin<"__builtin_ia32_ucmpw128_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_d_128: GCCBuiltin<"__builtin_ia32_ucmpd128_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> + def int_x86_avx512_mask_ucmp_q_128: GCCBuiltin<"__builtin_ia32_ucmpq128_mask">,
>> + Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
>> + llvm_i8_ty], [IntrNoMem]>;
>> }
>>
>> // Misc.
>>
>> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
>> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Oct 8 10:49:26 2014
>> @@ -16194,7 +16194,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(S
>> case INTR_TYPE_3OP:
>> return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
>> Op.getOperand(2), Op.getOperand(3));
>> - case CMP_MASK: {
>> + case CMP_MASK:
>> + case CMP_MASK_CC: {
>> // Comparison intrinsics with masks.
>> // Example of transformation:
>> // (i8 (int_x86_avx512_mask_pcmpeq_q_128
>> @@ -16207,12 +16208,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(S
>> EVT VT = Op.getOperand(1).getValueType();
>> EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
>> VT.getVectorNumElements());
>> - SDValue Mask = Op.getOperand(3);
>> + SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
>> EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
>> Mask.getValueType().getSizeInBits());
>> - SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT,
>> - Op.getOperand(1), Op.getOperand(2));
>> - SDValue CmpMask = getVectorMaskingNode(Cmp, Op.getOperand(3),
>> + SDValue Cmp;
>> + if (IntrData->Type == CMP_MASK_CC) {
>> + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
>> + Op.getOperand(2), Op.getOperand(3));
>> + } else {
>> + assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
>> + Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
>> + Op.getOperand(2));
>> + }
>> + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
>> DAG.getTargetConstant(0, MaskVT), DAG);
>> SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
>> DAG.getUNDEF(BitcastVT), CmpMask,
>>
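A note on the operand indexing above: operand 0 of the INTRINSIC_WO_CHAIN node is the intrinsic ID, so for the existing CMP_MASK intrinsics the mask is SDNode operand 3, while CMP_MASK_CC carries the comparison immediate as operand 3 and pushes the mask to operand 4. Sketched against the declarations added above (placeholder values):

    ; CMP_MASK: (a, b, mask)
    %r0 = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %m)
    ; CMP_MASK_CC: (a, b, cc, mask)
    %r1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a, <2 x i64> %b, i32 0, i8 %m)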
>> Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
>> +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Wed Oct 8 10:49:26 2014
>> @@ -20,7 +20,7 @@ enum IntrinsicType {
>> INTR_NO_TYPE,
>> GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
>> INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
>> - CMP_MASK, VSHIFT, COMI
>> + CMP_MASK, CMP_MASK_CC, VSHIFT, COMI
>> };
>>
>> struct IntrinsicData {
>> @@ -156,6 +156,18 @@ static const IntrinsicData IntrinsicsWi
>> X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
>> X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
>> X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_b_128, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_b_256, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_b_512, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_d_128, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_d_256, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_d_512, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_q_128, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_q_256, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_q_512, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_w_128, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_w_256, CMP_MASK_CC, X86ISD::CMPM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_cmp_w_512, CMP_MASK_CC, X86ISD::CMPM, 0),
>> X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128, CMP_MASK, X86ISD::PCMPEQM, 0),
>> X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256, CMP_MASK, X86ISD::PCMPEQM, 0),
>> X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512, CMP_MASK, X86ISD::PCMPEQM, 0),
>> @@ -180,6 +192,18 @@ static const IntrinsicData IntrinsicsWi
>> X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128, CMP_MASK, X86ISD::PCMPGTM, 0),
>> X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256, CMP_MASK, X86ISD::PCMPGTM, 0),
>> X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512, CMP_MASK, X86ISD::PCMPGTM, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> + X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
>> X86_INTRINSIC_DATA(avx_hadd_pd_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
>> X86_INTRINSIC_DATA(avx_hadd_ps_256, INTR_TYPE_2OP, X86ISD::FHADD, 0),
>> X86_INTRINSIC_DATA(avx_hsub_pd_256, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
>>
>> Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Wed Oct 8 10:49:26 2014
>> @@ -709,3 +709,243 @@ define i8 @test_mask_pcmpgt_q(<8 x i64>
>> }
>>
>> declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
>> +
>> +define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
>> +; CHECK-LABEL: test_cmp_d_512
>> +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##
>
> Please don’t add any more tests assuming --show-mc-encoding. That is going away pretty soon. {{\n}} or some such can check for EOL.
I guess we have a chicken-and-egg problem here. So don’t worry about this. I’ll fix this when I remove --show-mc-encoding.
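For reference, a check that pins the end of the line without relying on the encoding comment could look like

    ; CHECK: vpcmpeqd %zmm1, %zmm0, %k0{{$}}

using FileCheck's {{$}} end-of-line match (just a sketch).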
> Adam
>
>> + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_d_512
>> +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
>> +
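(As the CHECK lines above spell out, the eight immediates map to the predicate suffixes eq, lt, le, unord, neq, nlt, nle and ord for values 0 through 7; the unsigned ucmp variants insert a "u" into the predicate, e.g. vpcmpltud. The remaining tests exercise the same encoding for every element width and vector length.)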
>> +define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
>> +; CHECK-LABEL: test_ucmp_d_512
>> +; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_d_512
>> +; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
>> +
>> +define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
>> +; CHECK-LABEL: test_cmp_q_512
>> +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_q_512
>> +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
>> +; CHECK-LABEL: test_ucmp_q_512
>> +; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_q_512
>> +; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
>>
>> Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll Wed Oct 8 10:49:26 2014
>> @@ -63,3 +63,243 @@ define i32 @test_mask_pcmpgt_w(<32 x i16
>> }
>>
>> declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
>> +
>> +define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
>> +; CHECK-LABEL: test_cmp_b_512
>> +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
>> + %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
>> + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
>> +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
>> + %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
>> + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
>> +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
>> + %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
>> + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
>> +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
>> + %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
>> + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
>> +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
>> + %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
>> + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
>> +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
>> + %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
>> + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
>> +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
>> + %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
>> + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
>> +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
>> + %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
>> + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
>> + ret <8 x i64> %vec7
>> +}
>> +
>> +define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_b_512
>> +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
>> + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
>> +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
>> + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
>> +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
>> + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
>> +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
>> + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
>> +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
>> + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
>> +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
>> + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
>> +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
>> + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
>> +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
>> + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
>> + ret <8 x i64> %vec7
>> +}
>> +
>> +declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
>> +
>> +define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
>> +; CHECK-LABEL: test_ucmp_b_512
>> +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
>> + %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
>> + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
>> +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
>> + %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
>> + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
>> +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
>> + %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
>> + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
>> +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
>> + %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
>> + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
>> +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
>> + %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
>> + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
>> +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
>> + %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
>> + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
>> +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
>> + %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
>> + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
>> +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
>> + %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
>> + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
>> + ret <8 x i64> %vec7
>> +}
>> +
>> +define <8 x i64> @test_mask_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_b_512
>> +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
>> + %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
>> +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
>> + %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
>> +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
>> + %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
>> +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
>> + %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
>> +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
>> + %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
>> +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
>> + %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
>> +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
>> + %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
>> +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
>> + %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
>> + ret <8 x i64> %vec7
>> +}
>> +
>> +declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
>> +
>> +define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
>> +; CHECK-LABEL: test_cmp_w_512
>> +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_w_512
>> +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
>> +
>> +define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
>> +; CHECK-LABEL: test_ucmp_w_512
>> +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_w_512
>> +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
>>
>> Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll Wed Oct 8 10:49:26 2014
>> @@ -66,6 +66,246 @@ define i16 @test_mask_pcmpgt_w_256(<16 x
>>
>> declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
>>
>> +define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
>> +; CHECK-LABEL: test_cmp_b_256
>> +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_b_256
>> +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
>> +
>> +define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
>> +; CHECK-LABEL: test_ucmp_b_256
>> +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_b_256
>> +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
>> + %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
>> +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
>> + %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
>> +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
>> + %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
>> +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
>> + %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
>> +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
>> + %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
>> +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
>> + %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
>> +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
>> + %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
>> +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
>> + %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
>> + ret <8 x i32> %vec7
>> +}
>> +
>> +declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
>> +
>> +define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
>> +; CHECK-LABEL: test_cmp_w_256
>> +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_w_256
>> +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
>> +
>> +define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
>> +; CHECK-LABEL: test_ucmp_w_256
>> +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_w_256
>> +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
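A note for readers of these tests: the i32 immediate selects the comparison predicate, and the CHECK mnemonics spell out the encoding (0 = eq, 1 = lt, 2 = le, 3 = unord, 4 = neq, 5 = nlt, 6 = nle, 7 = ord). The cmp intrinsics map to the signed forms and ucmp to the unsigned ones, hence the extra "u" in those mnemonics. A minimal sketch of a direct call, using the declarations above (the function name is mine, not part of the patch):

  define i16 @cmp_w_256_lt(<16 x i16> %x, <16 x i16> %y) {
    ; imm 1 selects signed less-than (vpcmpltw); i16 -1 keeps all 16 lanes enabled
    %m = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %x, <16 x i16> %y, i32 1, i16 -1)
    ret i16 %m
  }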
>> +
>> ; 128-bit
>>
>> define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
>> @@ -131,3 +371,243 @@ define i8 @test_mask_pcmpgt_w_128(<8 x i
>> }
>>
>> declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
>> +
>> +define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
>> +; CHECK-LABEL: test_cmp_b_128
>> +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_b_128
>> +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
>> +
>> +define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
>> +; CHECK-LABEL: test_ucmp_b_128
>> +; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_b_128
>> +; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
>> + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
>> +; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
>> + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
>> +; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
>> + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
>> +; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
>> + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
>> +; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
>> + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
>> +; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
>> + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
>> +; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
>> + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
>> +; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
>> + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
>> + ret <8 x i16> %vec7
>> +}
>> +
>> +declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
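The trailing integer operand is the write-mask that shows up as {%k1} in the masked variants; as I read the tests, lanes disabled there come back as zero, i.e. the returned mask should be the raw compare result ANDed with the argument. A sketch under that assumption (helper name is hypothetical):

  define i16 @cmp_b_128_eq_masked(<16 x i8> %x, <16 x i8> %y, i16 %keep) {
    ; imm 0 = equality; only lanes enabled in %keep can survive into the result
    %m = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %x, <16 x i8> %y, i32 0, i16 %keep)
    ret i16 %m
  }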
>> +
>> +define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
>> +; CHECK-LABEL: test_cmp_w_128
>> +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_w_128
>> +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
>> +; CHECK-LABEL: test_ucmp_w_128
>> +; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_w_128
>> +; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
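Note the result narrows with the lane count: the eight-lane word compare returns i8 with every bit meaningful, while the quadword variants further down only populate the low four (256-bit) or two (128-bit) bits. For example, unsigned not-less-than across all eight words (sketch, not from the patch):

  define i8 @ucmp_w_128_nlt(<8 x i16> %x, <8 x i16> %y) {
    ; imm 5 selects vpcmpnltuw, i.e. unsigned greater-or-equal, one bit per lane
    %m = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %x, <8 x i16> %y, i32 5, i8 -1)
    ret i8 %m
  }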
>>
>> Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
>> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
>> ==============================================================================
>> --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
>> +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Wed Oct 8 10:49:26 2014
>> @@ -66,6 +66,246 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i
>>
>> declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
>>
>> +define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
>> +; CHECK-LABEL: test_cmp_d_256
>> +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_d_256
>> +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
>> +; CHECK-LABEL: test_ucmp_d_256
>> +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_d_256
>> +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
>> +; CHECK-LABEL: test_cmp_q_256
>> +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_q_256
>> +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
>> +; CHECK-LABEL: test_ucmp_q_256
>> +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_q_256
>> +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
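Since a 256-bit quadword compare only has four lanes, I'd expect bits 4..7 of the returned i8 to come back zero; that is an assumption on my part, not something these tests pin down. A sketch of the unsigned less-or-equal case:

  define i8 @ucmp_q_256_le(<4 x i64> %x, <4 x i64> %y) {
    ; imm 2 selects vpcmpleuq; only the low 4 bits of %m correspond to lanes
    %m = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %x, <4 x i64> %y, i32 2, i8 -1)
    ret i8 %m
  }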
>> +
>> ; 128-bit
>>
>> define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
>> @@ -131,3 +371,243 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i
>> }
>>
>> declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
>> +
>> +define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
>> +; CHECK-LABEL: test_cmp_d_128
>> +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_d_128
>> +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
>> +; CHECK-LABEL: test_ucmp_d_128
>> +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_d_128
>> +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
>> +; CHECK-LABEL: test_cmp_q_128
>> +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_cmp_q_128
>> +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
>> +
>> +define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
>> +; CHECK-LABEL: test_ucmp_q_128
>> +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
>> +; CHECK-LABEL: test_mask_ucmp_q_128
>> +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
>> + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
>> +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
>> + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
>> +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
>> + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
>> +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
>> + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
>> +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
>> + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
>> +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
>> + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
>> +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
>> + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
>> +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
>> + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
>> + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
>> + ret <8 x i8> %vec7
>> +}
>> +
>> +declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
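To close the loop on how these masks get consumed: once the compare result is back in an integer, it can be truncated and bitcast to <2 x i1> to drive an ordinary IR select. A sketch (mine, not from the patch) of an unsigned per-lane minimum built on the 128-bit quadword intrinsic:

  define <2 x i64> @umin_q_128(<2 x i64> %x, <2 x i64> %y) {
    ; imm 1 = unsigned less-than; only the low 2 bits of %m are meaningful
    %m = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %x, <2 x i64> %y, i32 1, i8 -1)
    %lo = trunc i8 %m to i2
    %c = bitcast i2 %lo to <2 x i1>
    %r = select <2 x i1> %c, <2 x i64> %x, <2 x i64> %y
    ret <2 x i64> %r
  }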
>>
>>