[llvm] r219316 - [AVX512] Added intrinsics for 128-, 256- and 512-bit versions of VPCMP/VPCMPU{BWDQ}

Adam Nemet anemet at apple.com
Wed Oct 8 10:49:18 PDT 2014


On Oct 8, 2014, at 8:49 AM, Robert Khasanov <rob.khasanov at gmail.com> wrote:

> Author: rkhasanov
> Date: Wed Oct  8 10:49:26 2014
> New Revision: 219316
> 
> URL: http://llvm.org/viewvc/llvm-project?rev=219316&view=rev
> Log:
> [AVX512] Added intrinsics for 128-, 256- and 512-bit versions of VPCMP/VPCMPU{BWDQ}
> Added CMP_MASK_CC intrinsic type.
> Added tests for intrinsics.
> 
> Patch by Sergey Lisitsyn <sergey.lisitsyn at intel.com>
> 
> Modified:
>    llvm/trunk/include/llvm/IR/IntrinsicsX86.td
>    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
>    llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
>    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
>    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
>    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
>    llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
> 
> Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
> +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Wed Oct  8 10:49:26 2014
> @@ -3263,6 +3263,32 @@ let TargetPrefix = "x86" in {
>         Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
>                   [IntrNoMem]>;
> 
> +  def int_x86_avx512_mask_cmp_b_512: GCCBuiltin<"__builtin_ia32_cmpb512_mask">,
> +        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
> +                  llvm_i64_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_w_512: GCCBuiltin<"__builtin_ia32_cmpw512_mask">,
> +        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
> +                  llvm_i32_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_d_512: GCCBuiltin<"__builtin_ia32_cmpd512_mask">,
> +        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
> +                  llvm_i16_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_q_512: GCCBuiltin<"__builtin_ia32_cmpq512_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +
> +  def int_x86_avx512_mask_ucmp_b_512: GCCBuiltin<"__builtin_ia32_ucmpb512_mask">,
> +        Intrinsic<[llvm_i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty,
> +                  llvm_i64_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_w_512: GCCBuiltin<"__builtin_ia32_ucmpw512_mask">,
> +        Intrinsic<[llvm_i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty,
> +                  llvm_i32_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_d_512: GCCBuiltin<"__builtin_ia32_ucmpd512_mask">,
> +        Intrinsic<[llvm_i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty,
> +                  llvm_i16_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_q_512: GCCBuiltin<"__builtin_ia32_ucmpq512_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +
>   // 256-bit
>   def int_x86_avx512_mask_pcmpeq_b_256 : GCCBuiltin<"__builtin_ia32_pcmpeqb256_mask">,
>         Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
> @@ -3290,6 +3316,32 @@ let TargetPrefix = "x86" in {
>         Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
>                   [IntrNoMem]>;
> 
> +  def int_x86_avx512_mask_cmp_b_256: GCCBuiltin<"__builtin_ia32_cmpb256_mask">,
> +        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
> +                  llvm_i32_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_w_256: GCCBuiltin<"__builtin_ia32_cmpw256_mask">,
> +        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
> +                  llvm_i16_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_d_256: GCCBuiltin<"__builtin_ia32_cmpd256_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_q_256: GCCBuiltin<"__builtin_ia32_cmpq256_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +
> +  def int_x86_avx512_mask_ucmp_b_256: GCCBuiltin<"__builtin_ia32_ucmpb256_mask">,
> +        Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty,
> +                  llvm_i32_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_w_256: GCCBuiltin<"__builtin_ia32_ucmpw256_mask">,
> +        Intrinsic<[llvm_i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty,
> +                  llvm_i16_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_d_256: GCCBuiltin<"__builtin_ia32_ucmpd256_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_q_256: GCCBuiltin<"__builtin_ia32_ucmpq256_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +
>   // 128-bit
>   def int_x86_avx512_mask_pcmpeq_b_128 : GCCBuiltin<"__builtin_ia32_pcmpeqb128_mask">,
>         Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
> @@ -3316,6 +3368,32 @@ let TargetPrefix = "x86" in {
>   def int_x86_avx512_mask_pcmpgt_q_128: GCCBuiltin<"__builtin_ia32_pcmpgtq128_mask">,
>         Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
>                   [IntrNoMem]>;
> +
> +  def int_x86_avx512_mask_cmp_b_128: GCCBuiltin<"__builtin_ia32_cmpb128_mask">,
> +        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
> +                  llvm_i16_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_w_128: GCCBuiltin<"__builtin_ia32_cmpw128_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_d_128: GCCBuiltin<"__builtin_ia32_cmpd128_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_cmp_q_128: GCCBuiltin<"__builtin_ia32_cmpq128_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +
> +  def int_x86_avx512_mask_ucmp_b_128: GCCBuiltin<"__builtin_ia32_ucmpb128_mask">,
> +        Intrinsic<[llvm_i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty,
> +                  llvm_i16_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_w_128: GCCBuiltin<"__builtin_ia32_ucmpw128_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_d_128: GCCBuiltin<"__builtin_ia32_ucmpd128_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> +  def int_x86_avx512_mask_ucmp_q_128: GCCBuiltin<"__builtin_ia32_ucmpq128_mask">,
> +        Intrinsic<[llvm_i8_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty,
> +                  llvm_i8_ty], [IntrNoMem]>;
> }
> 
> // Misc.
> 
> Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
> +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Oct  8 10:49:26 2014
> @@ -16194,7 +16194,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(S
>     case INTR_TYPE_3OP:
>       return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
>         Op.getOperand(2), Op.getOperand(3));
> -    case CMP_MASK: {
> +    case CMP_MASK:
> +    case CMP_MASK_CC: {
>       // Comparison intrinsics with masks.
>       // Example of transformation:
>       // (i8 (int_x86_avx512_mask_pcmpeq_q_128
> @@ -16207,12 +16208,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(S
>       EVT VT = Op.getOperand(1).getValueType();
>       EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
>                                     VT.getVectorNumElements());
> -      SDValue Mask = Op.getOperand(3);
> +      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
>       EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
>                                        Mask.getValueType().getSizeInBits());
> -      SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT,
> -                                Op.getOperand(1), Op.getOperand(2));
> -      SDValue CmpMask = getVectorMaskingNode(Cmp, Op.getOperand(3),
> +      SDValue Cmp;
> +      if (IntrData->Type == CMP_MASK_CC) {
> +        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
> +                    Op.getOperand(2), Op.getOperand(3));
> +      } else {
> +        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
> +        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
> +                    Op.getOperand(2));
> +      }
> +      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
>                                         DAG.getTargetConstant(0, MaskVT), DAG);
>       SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
>                                 DAG.getUNDEF(BitcastVT), CmpMask,
> 
> Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
> +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Wed Oct  8 10:49:26 2014
> @@ -20,7 +20,7 @@ enum IntrinsicType {
>   INTR_NO_TYPE,
>   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
>   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
> -  CMP_MASK, VSHIFT, COMI
> +  CMP_MASK, CMP_MASK_CC, VSHIFT, COMI
> };
> 
> struct IntrinsicData {
> @@ -156,6 +156,18 @@ static const IntrinsicData  IntrinsicsWi
>   X86_INTRINSIC_DATA(avx2_psubus_b,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
>   X86_INTRINSIC_DATA(avx2_psubus_w,     INTR_TYPE_2OP, X86ISD::SUBUS, 0),
>   X86_INTRINSIC_DATA(avx2_vperm2i128,   INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_b_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_b_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_b_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_d_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_d_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_d_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_q_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_q_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_q_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_w_128,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_w_256,     CMP_MASK_CC,  X86ISD::CMPM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_cmp_w_512,     CMP_MASK_CC,  X86ISD::CMPM, 0),
>   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_128,  CMP_MASK,  X86ISD::PCMPEQM, 0),
>   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_256,  CMP_MASK,  X86ISD::PCMPEQM, 0),
>   X86_INTRINSIC_DATA(avx512_mask_pcmpeq_b_512,  CMP_MASK,  X86ISD::PCMPEQM, 0),
> @@ -180,6 +192,18 @@ static const IntrinsicData  IntrinsicsWi
>   X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_128,  CMP_MASK,  X86ISD::PCMPGTM, 0),
>   X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_256,  CMP_MASK,  X86ISD::PCMPGTM, 0),
>   X86_INTRINSIC_DATA(avx512_mask_pcmpgt_w_512,  CMP_MASK,  X86ISD::PCMPGTM, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_d_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_q_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
> +  X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
>   X86_INTRINSIC_DATA(avx_hadd_pd_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
>   X86_INTRINSIC_DATA(avx_hadd_ps_256,   INTR_TYPE_2OP, X86ISD::FHADD, 0),
>   X86_INTRINSIC_DATA(avx_hsub_pd_256,   INTR_TYPE_2OP, X86ISD::FHSUB, 0),
> 
> Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Wed Oct  8 10:49:26 2014
> @@ -709,3 +709,243 @@ define i8 @test_mask_pcmpgt_q(<8 x i64>
> }
> 
> declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8)
> +
> +define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
> +; CHECK-LABEL: test_cmp_d_512
> +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 ##

Please don't add any more tests assuming -show-mc-encoding.  That is going away pretty soon.  {{\n}} or some such can check for EOL.
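For instance, a check along these lines (an untested sketch; {{$}} is FileCheck's regex end-of-line anchor) would pin down the whole instruction without relying on the encoding comment:

  ; CHECK: vpcmpeqd %zmm1, %zmm0, %k0{{$}}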

Adam

> +  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltd %zmm1, %zmm0, %k0 ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpled %zmm1, %zmm0, %k0 ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnled %zmm1, %zmm0, %k0 ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordd %zmm1, %zmm0, %k0 ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
> +; CHECK-LABEL: test_mask_cmp_d_512
> +; CHECK: vpcmpeqd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpled %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpneqd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnled %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
> +
> +define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
> +; CHECK-LABEL: test_ucmp_d_512
> +; CHECK: vpcmpequd %zmm1, %zmm0, %k0 ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltud %zmm1, %zmm0, %k0 ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleud %zmm1, %zmm0, %k0 ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordud %zmm1, %zmm0, %k0 ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_d_512
> +; CHECK: vpcmpequd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltud %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleud %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordud %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpnequd %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltud %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleud %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordud %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
> +
> +define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
> +; CHECK-LABEL: test_cmp_q_512
> +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltq %zmm1, %zmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleq %zmm1, %zmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordq %zmm1, %zmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_cmp_q_512
> +; CHECK: vpcmpeqq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
> +; CHECK-LABEL: test_ucmp_q_512
> +; CHECK: vpcmpequq %zmm1, %zmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduq %zmm1, %zmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_q_512
> +; CHECK: vpcmpequq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduq %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
> 
> Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll Wed Oct  8 10:49:26 2014
> @@ -63,3 +63,243 @@ define i32 @test_mask_pcmpgt_w(<32 x i16
> }
> 
> declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32)
> +
> +define <8 x i64> @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
> +; CHECK-LABEL: test_cmp_b_512
> +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 ##
> +  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
> +  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
> +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 ##
> +  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
> +  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
> +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 ##
> +  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
> +  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
> +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 ##
> +  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
> +  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
> +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 ##
> +  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
> +  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
> +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 ##
> +  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
> +  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
> +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 ##
> +  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
> +  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
> +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 ##
> +  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
> +  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
> +  ret <8 x i64> %vec7
> +}
> +
> +define <8 x i64> @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
> +; CHECK-LABEL: test_mask_cmp_b_512
> +; CHECK: vpcmpeqb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
> +  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
> +; CHECK: vpcmpltb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
> +  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
> +; CHECK: vpcmpleb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
> +  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
> +; CHECK: vpcmpunordb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
> +  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
> +; CHECK: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
> +  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
> +; CHECK: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
> +  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
> +; CHECK: vpcmpnleb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
> +  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
> +; CHECK: vpcmpordb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
> +  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
> +  ret <8 x i64> %vec7
> +}
> +
> +declare i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
> +
> +define <8 x i64> @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) {
> +; CHECK-LABEL: test_ucmp_b_512
> +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 ##
> +  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
> +  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
> +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 ##
> +  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 -1)
> +  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
> +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 ##
> +  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 -1)
> +  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
> +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 ##
> +  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 -1)
> +  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
> +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 ##
> +  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 -1)
> +  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
> +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 ##
> +  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 -1)
> +  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
> +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 ##
> +  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 -1)
> +  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
> +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 ##
> +  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 -1)
> +  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
> +  ret <8 x i64> %vec7
> +}
> +
> +define <8 x i64> @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
> +; CHECK-LABEL: test_mask_x86_avx512_ucmp_b_512
> +; CHECK: vpcmpequb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
> +  %vec0 = insertelement <8 x i64> undef, i64 %res0, i32 0
> +; CHECK: vpcmpltub %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 1, i64 %mask)
> +  %vec1 = insertelement <8 x i64> %vec0, i64 %res1, i32 1
> +; CHECK: vpcmpleub %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 2, i64 %mask)
> +  %vec2 = insertelement <8 x i64> %vec1, i64 %res2, i32 2
> +; CHECK: vpcmpunordub %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 3, i64 %mask)
> +  %vec3 = insertelement <8 x i64> %vec2, i64 %res3, i32 3
> +; CHECK: vpcmpnequb %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 4, i64 %mask)
> +  %vec4 = insertelement <8 x i64> %vec3, i64 %res4, i32 4
> +; CHECK: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 5, i64 %mask)
> +  %vec5 = insertelement <8 x i64> %vec4, i64 %res5, i32 5
> +; CHECK: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 6, i64 %mask)
> +  %vec6 = insertelement <8 x i64> %vec5, i64 %res6, i32 6
> +; CHECK: vpcmpordub %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 7, i64 %mask)
> +  %vec7 = insertelement <8 x i64> %vec6, i64 %res7, i32 7
> +  ret <8 x i64> %vec7
> +}
> +
> +declare i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8>, <64 x i8>, i32, i64) nounwind readnone
> +
> +define <8 x i32> @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
> +; CHECK-LABEL: test_cmp_w_512
> +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmplew %zmm1, %zmm0, %k0 ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +define <8 x i32> @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
> +; CHECK-LABEL: test_mask_cmp_w_512
> +; CHECK: vpcmpeqw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmplew %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunordw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnlew %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmpordw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +declare i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
> +
> +define <8 x i32> @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) {
> +; CHECK-LABEL: test_ucmp_w_512
> +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 -1)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 -1)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 -1)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 -1)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 -1)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 -1)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 -1)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +define <8 x i32> @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_w_512
> +; CHECK: vpcmpequw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltuw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 1, i32 %mask)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 2, i32 %mask)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunorduw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 3, i32 %mask)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpnequw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 4, i32 %mask)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 5, i32 %mask)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 6, i32 %mask)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmporduw %zmm1, %zmm0, %k0 {%k1} ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 7, i32 %mask)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
> 
> Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll Wed Oct  8 10:49:26 2014
> @@ -66,6 +66,246 @@ define i16 @test_mask_pcmpgt_w_256(<16 x
> 
> declare i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16>, <16 x i16>, i16)
> 
> +define <8 x i32> @test_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
> +; CHECK-LABEL: test_cmp_b_256
> +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +define <8 x i32> @test_mask_cmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
> +; CHECK-LABEL: test_mask_cmp_b_256
> +; CHECK: vpcmpeqb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmpleb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunordb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpneqb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnleb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmpordb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +declare i32 @llvm.x86.avx512.mask.cmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
> +
> +define <8 x i32> @test_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1) {
> +; CHECK-LABEL: test_ucmp_b_256
> +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 -1)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 -1)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 -1)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 -1)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 -1)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 -1)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 -1)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 -1)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +define <8 x i32> @test_mask_ucmp_b_256(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_b_256
> +; CHECK: vpcmpequb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 0, i32 %mask)
> +  %vec0 = insertelement <8 x i32> undef, i32 %res0, i32 0
> +; CHECK: vpcmpltub %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 1, i32 %mask)
> +  %vec1 = insertelement <8 x i32> %vec0, i32 %res1, i32 1
> +; CHECK: vpcmpleub %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 2, i32 %mask)
> +  %vec2 = insertelement <8 x i32> %vec1, i32 %res2, i32 2
> +; CHECK: vpcmpunordub %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 3, i32 %mask)
> +  %vec3 = insertelement <8 x i32> %vec2, i32 %res3, i32 3
> +; CHECK: vpcmpnequb %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 4, i32 %mask)
> +  %vec4 = insertelement <8 x i32> %vec3, i32 %res4, i32 4
> +; CHECK: vpcmpnltub %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 5, i32 %mask)
> +  %vec5 = insertelement <8 x i32> %vec4, i32 %res5, i32 5
> +; CHECK: vpcmpnleub %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 6, i32 %mask)
> +  %vec6 = insertelement <8 x i32> %vec5, i32 %res6, i32 6
> +; CHECK: vpcmpordub %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> %a0, <32 x i8> %a1, i32 7, i32 %mask)
> +  %vec7 = insertelement <8 x i32> %vec6, i32 %res7, i32 7
> +  ret <8 x i32> %vec7
> +}
> +
> +declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nounwind readnone
> +
> +define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
> +; CHECK-LABEL: test_cmp_w_256
> +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmplew %ymm1, %ymm0, %k0 ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
> +; CHECK-LABEL: test_mask_cmp_w_256
> +; CHECK: vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmplew %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpneqw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnlew %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
> +
> +define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
> +; CHECK-LABEL: test_ucmp_w_256
> +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 -1)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 -1)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 -1)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 -1)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 -1)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 -1)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 -1)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_w_256
> +; CHECK: vpcmpequw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltuw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 1, i16 %mask)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleuw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 2, i16 %mask)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunorduw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 3, i16 %mask)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpnequw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 4, i16 %mask)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltuw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 5, i16 %mask)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleuw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 6, i16 %mask)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmporduw %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 7, i16 %mask)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +declare i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16>, <16 x i16>, i32, i16) nounwind readnone
> +
> ; 128-bit
> 
> define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
> @@ -131,3 +371,243 @@ define i8 @test_mask_pcmpgt_w_128(<8 x i
> }
> 
> declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
> +
> +define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
> +; CHECK-LABEL: test_cmp_b_128
> +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordb %xmm1, %xmm0, %k0 ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
> +; CHECK-LABEL: test_mask_cmp_b_128
> +; CHECK: vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpneqb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
> +
> +define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
> +; CHECK-LABEL: test_ucmp_b_128
> +; CHECK: vpcmpequb %xmm1, %xmm0, %k0 ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltub %xmm1, %xmm0, %k0 ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 -1)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleub %xmm1, %xmm0, %k0 ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 -1)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 -1)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 -1)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 -1)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 -1)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordub %xmm1, %xmm0, %k0 ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 -1)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_b_128
> +; CHECK: vpcmpequb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
> +  %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
> +; CHECK: vpcmpltub %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 1, i16 %mask)
> +  %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
> +; CHECK: vpcmpleub %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 2, i16 %mask)
> +  %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
> +; CHECK: vpcmpunordub %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 3, i16 %mask)
> +  %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
> +; CHECK: vpcmpnequb %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 4, i16 %mask)
> +  %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
> +; CHECK: vpcmpnltub %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 5, i16 %mask)
> +  %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
> +; CHECK: vpcmpnleub %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 6, i16 %mask)
> +  %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
> +; CHECK: vpcmpordub %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 7, i16 %mask)
> +  %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
> +  ret <8 x i16> %vec7
> +}
> +
> +declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nounwind readnone
> +
> +define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
> +; CHECK-LABEL: test_cmp_w_128
> +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltw %xmm1, %xmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmplew %xmm1, %xmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordw %xmm1, %xmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_cmp_w_128
> +; CHECK: vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmplew %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnlew %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
> +; CHECK-LABEL: test_ucmp_w_128
> +; CHECK: vpcmpequw %xmm1, %xmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduw %xmm1, %xmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_w_128
> +; CHECK: vpcmpequw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduw %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
> 
> Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
> URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=219316&r1=219315&r2=219316&view=diff
> ==============================================================================
> --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
> +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Wed Oct  8 10:49:26 2014
> @@ -66,6 +66,246 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i
> 
> declare i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64>, <4 x i64>, i8)
> 
> +define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
> +; CHECK-LABEL: test_cmp_d_256
> +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpled %ymm1, %ymm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_cmp_d_256
> +; CHECK: vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpled %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnled %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
> +; CHECK-LABEL: test_ucmp_d_256
> +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_d_256
> +; CHECK: vpcmpequd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltud %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleud %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordud %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequd %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltud %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleud %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordud %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
> +; CHECK-LABEL: test_cmp_q_256
> +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_cmp_q_256
> +; CHECK: vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
> +; CHECK-LABEL: test_ucmp_q_256
> +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_q_256
> +; CHECK: vpcmpequq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduq %ymm1, %ymm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwind readnone
> +
> ; 128-bit
> 
> define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
> @@ -131,3 +371,243 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i
> }
> 
> declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8)
> +
> +define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
> +; CHECK-LABEL: test_cmp_d_128
> +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpled %xmm1, %xmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_cmp_d_128
> +; CHECK: vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpled %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnled %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
> +; CHECK-LABEL: test_ucmp_d_128
> +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_d_128
> +; CHECK: vpcmpequd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltud %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleud %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordud %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequd %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltud %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleud %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordud %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
> +; CHECK-LABEL: test_cmp_q_128
> +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_cmp_q_128
> +; CHECK: vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunordq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpneqq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmpordq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
> +
> +define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
> +; CHECK-LABEL: test_ucmp_q_128
> +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 -1)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 -1)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 -1)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 -1)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 -1)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 -1)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 -1)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
> +; CHECK-LABEL: test_mask_ucmp_q_128
> +; CHECK: vpcmpequq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
> +  %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
> +; CHECK: vpcmpltuq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 1, i8 %mask)
> +  %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
> +; CHECK: vpcmpleuq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 2, i8 %mask)
> +  %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
> +; CHECK: vpcmpunorduq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 3, i8 %mask)
> +  %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
> +; CHECK: vpcmpnequq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 4, i8 %mask)
> +  %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
> +; CHECK: vpcmpnltuq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 5, i8 %mask)
> +  %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
> +; CHECK: vpcmpnleuq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 6, i8 %mask)
> +  %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
> +; CHECK: vpcmporduq %xmm1, %xmm0, %k0 {%k1} ##
> +  %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 7, i8 %mask)
> +  %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
> +  ret <8 x i8> %vec7
> +}
> +
> +declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone
> 
> 
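A note on the new tests: each call site passes the predicate as the i32 immediate (0..7, cycling eq/lt/le/unord/neq/nlt/nle/ord per the CHECK lines above). As a minimal standalone sketch (not part of the patch), here is how one of the new intrinsics can be driven directly from IR, reusing the declaration from the 128-bit test; immediate 1 selects the unsigned "lt" predicate:

define i8 @ucmp_lt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
  ; Per-lane unsigned a < b; lanes cleared in %mask come back as 0 in the result.
  %r = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a, <2 x i64> %b, i32 1, i8 %mask)
  ret i8 %r
}

declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwind readnone

Run through llc with -mtriple=x86_64 -mattr=+avx512vl, this should select vpcmpltuq under a {%k1} write-mask, matching the masked CHECK lines in test_mask_ucmp_q_128.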