[llvm] r333346 - [X86] Remove masking from avx512ifma intrinsics. Use a select instead.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Sat May 26 11:55:19 PDT 2018


Author: ctopper
Date: Sat May 26 11:55:19 2018
New Revision: 333346

URL: http://llvm.org/viewvc/llvm-project?rev=333346&view=rev
Log:
[X86] Remove masking from avx512ifma intrinsics. Use a select instead.

This allows us to avoid having mask and maskz variants, reducing the intrinsic count from 12 to 6.
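
For illustration, here is the shape of the change at the IR level (a
minimal sketch using the 512-bit low-half form; %acc, %a, %b and %m are
placeholder names, not taken from the patch). The old masked intrinsic:

  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b, i8 %m)

is auto-upgraded to the unmasked intrinsic plus a select, with the
accumulator (or zeroinitializer for the maskz flavor) as the false value:

  %r = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %acc, <8 x i64> %a, <8 x i64> %b)
  %mb = bitcast i8 %m to <8 x i1>
  %res = select <8 x i1> %mb, <8 x i64> %r, <8 x i64> %acc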

Added:
    llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll
    llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll
      - copied, changed from r333344, llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll
    llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll
      - copied, changed from r333344, llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
Modified:
    llvm/trunk/include/llvm/IR/IntrinsicsX86.td
    llvm/trunk/lib/IR/AutoUpgrade.cpp
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
    llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll

Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Sat May 26 11:55:19 2018
@@ -2657,54 +2657,30 @@ let TargetPrefix = "x86" in {  // All in
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty,  llvm_i16_ty,
           llvm_i32_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_mask_vpmadd52h_uq_128 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52huq128_mask">,
+  def int_x86_avx512_vpmadd52h_uq_128 :
+              GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
               Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
-                         llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_maskz_vpmadd52h_uq_128 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52huq128_maskz">,
+                         llvm_v2i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpmadd52l_uq_128 :
+              GCCBuiltin<"__builtin_ia32_vpmadd52luq128">,
               Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
-                         llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpmadd52l_uq_128 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52luq128_mask">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
-                         llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_maskz_vpmadd52l_uq_128 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52luq128_maskz">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty,
-                         llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpmadd52h_uq_256 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52huq256_mask">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
-                         llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_maskz_vpmadd52h_uq_256 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52huq256_maskz">,
+                         llvm_v2i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpmadd52h_uq_256 :
+              GCCBuiltin<"__builtin_ia32_vpmadd52huq256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
-                         llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpmadd52l_uq_256 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52luq256_mask">,
+                         llvm_v4i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpmadd52l_uq_256 :
+              GCCBuiltin<"__builtin_ia32_vpmadd52luq256">,
               Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
-                         llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_maskz_vpmadd52l_uq_256 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52luq256_maskz">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty,
-                         llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpmadd52h_uq_512 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52huq512_mask">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
-                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_maskz_vpmadd52h_uq_512 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52huq512_maskz">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
-                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpmadd52l_uq_512 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52luq512_mask">,
+                         llvm_v4i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpmadd52h_uq_512 :
+              GCCBuiltin<"__builtin_ia32_vpmadd52huq512">,
               Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
-                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_maskz_vpmadd52l_uq_512 :
-              GCCBuiltin<"__builtin_ia32_vpmadd52luq512_maskz">,
+                         llvm_v8i64_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpmadd52l_uq_512 :
+              GCCBuiltin<"__builtin_ia32_vpmadd52luq512">,
               Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
-                         llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
+                         llvm_v8i64_ty], [IntrNoMem]>;
 }
 
 // VNNI

Modified: llvm/trunk/lib/IR/AutoUpgrade.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/IR/AutoUpgrade.cpp?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/lib/IR/AutoUpgrade.cpp (original)
+++ llvm/trunk/lib/IR/AutoUpgrade.cpp Sat May 26 11:55:19 2018
@@ -265,6 +265,8 @@ static bool ShouldUpgradeX86Intrinsic(Fu
       Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
       Name.startswith("avx512.mask.pternlog.") || // Added in 7.0
       Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0
+      Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0
+      Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0
       Name == "sse.cvtsi2ss" || // Added in 7.0
       Name == "sse.cvtsi642ss" || // Added in 7.0
       Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2569,6 +2571,34 @@ void llvm::UpgradeIntrinsicCall(CallInst
       Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                  : CI->getArgOperand(0);
       Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru);
+    } else if (IsX86 && (Name.startswith("avx512.mask.vpmadd52") ||
+                         Name.startswith("avx512.maskz.vpmadd52"))) {
+      bool ZeroMask = Name[11] == 'z';
+      bool High = Name[20] == 'h' || Name[21] == 'h';
+      unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+      Intrinsic::ID IID;
+      if (VecWidth == 128 && !High)
+        IID = Intrinsic::x86_avx512_vpmadd52l_uq_128;
+      else if (VecWidth == 256 && !High)
+        IID = Intrinsic::x86_avx512_vpmadd52l_uq_256;
+      else if (VecWidth == 512 && !High)
+        IID = Intrinsic::x86_avx512_vpmadd52l_uq_512;
+      else if (VecWidth == 128 && High)
+        IID = Intrinsic::x86_avx512_vpmadd52h_uq_128;
+      else if (VecWidth == 256 && High)
+        IID = Intrinsic::x86_avx512_vpmadd52h_uq_256;
+      else if (VecWidth == 512 && High)
+        IID = Intrinsic::x86_avx512_vpmadd52h_uq_512;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+
+      Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+                        CI->getArgOperand(2) };
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+                               Args);
+      Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+                                 : CI->getArgOperand(0);
+      Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
     } else if (IsX86 && Name.startswith("avx512.mask.") &&
                upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
       // Rep will be updated by the call in the condition.
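
For reference, the character indexing above is applied to the intrinsic
name after the "llvm.x86." prefix has been stripped, so the offsets work
out as follows (illustrative examples):

  Name = "avx512.mask.vpmadd52h.uq.512"   -> Name[11] == '.', Name[20] == 'h'
  Name = "avx512.maskz.vpmadd52l.uq.256"  -> Name[11] == 'z', Name[21] == 'l'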

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Sat May 26 11:55:19 2018
@@ -20624,26 +20624,11 @@ SDValue X86TargetLowering::LowerINTRINSI
                                               Src3),
                                   Mask, PassThru, Subtarget, DAG);
     }
-    case IFMA_OP_MASKZ:
-    case IFMA_OP_MASK: {
-      SDValue Src1 = Op.getOperand(1);
-      SDValue Src2 = Op.getOperand(2);
-      SDValue Src3 = Op.getOperand(3);
-      SDValue Mask = Op.getOperand(4);
-      MVT VT = Op.getSimpleValueType();
-      SDValue PassThru = Src1;
-
-      // set PassThru element
-      if (IntrData->Type == IFMA_OP_MASKZ)
-        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-
-      // Node we need to swizzle the operands to pass the multiply operands
+    case IFMA_OP:
+      // NOTE: We need to swizzle the operands to pass the multiply operands
       // first.
-      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
-                                              dl, Op.getValueType(),
-                                              Src2, Src3, Src1),
-                                  Mask, PassThru, Subtarget, DAG);
-    }
+      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
     case CVTPD2PS:
       // ISD::FP_ROUND has a second argument that indicates if the truncation
       // does not change the value. Set it to 0 since it can change.
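
In other words, the new IFMA_OP lowering only has to reorder operands:
the intrinsic keeps the accumulator first, while the X86ISD node takes
the multiply operands first (an illustrative mapping, with placeholder
names):

  llvm.x86.avx512.vpmadd52l.uq.512(%acc, %a, %b)
      -> X86ISD::VPMADD52L %a, %b, %acc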

Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Sat May 26 11:55:19 2018
@@ -30,7 +30,7 @@ enum IntrinsicType : uint16_t {
   INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK,
   FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
   FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
-  IFMA_OP_MASK, IFMA_OP_MASKZ,
+  IFMA_OP,
   VPERM_2OP, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
@@ -1133,18 +1133,6 @@ static const IntrinsicData  IntrinsicsWi
                     X86ISD::VPERMV3, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
-                     X86ISD::VPMADD52H, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
-                     X86ISD::VPMADD52H, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
-                     X86ISD::VPMADD52H, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
-                     X86ISD::VPMADD52L, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
-                     X86ISD::VPMADD52L, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
-                     X86ISD::VPMADD52L, 0),
 
   X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
@@ -1325,18 +1313,6 @@ static const IntrinsicData  IntrinsicsWi
                      X86ISD::VPERMV3, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
                      X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
-                     X86ISD::VPMADD52H, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
-                     X86ISD::VPMADD52H, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
-                     X86ISD::VPMADD52H, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
-                     X86ISD::VPMADD52L, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
-                     X86ISD::VPMADD52L, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
-                     X86ISD::VPMADD52L, 0),
 
   X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
@@ -1461,6 +1437,12 @@ static const IntrinsicData  IntrinsicsWi
   X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
   X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
+  X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_128 , IFMA_OP, X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_256 , IFMA_OP, X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_vpmadd52h_uq_512 , IFMA_OP, X86ISD::VPMADD52H, 0),
+  X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_pd,        INTR_TYPE_3OP, ISD::FMA, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_pd_256,    INTR_TYPE_3OP, ISD::FMA, 0),
   X86_INTRINSIC_DATA(fma_vfmadd_ps,        INTR_TYPE_3OP, ISD::FMA, 0),

Added: llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll?rev=333346&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-fast-isel.ll Sat May 26 11:55:19 2018
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifma-builtins.c
+
+define <8 x i64> @test_mm512_madd52hi_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+  ret <8 x i64> %0
+}
+
+define <8 x i64> @test_mm512_mask_madd52hi_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X32-LABEL: test_mm512_mask_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_madd52hi_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_maskz_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_madd52lo_epu64(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+  ret <8 x i64> %0
+}
+
+define <8 x i64> @test_mm512_mask_madd52lo_epu64(<8 x i64> %__W, i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y) {
+; X32-LABEL: test_mm512_mask_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_mask_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__W, <8 x i64> %__X, <8 x i64> %__Y)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> %__W
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_madd52lo_epu64(i8 zeroext %__M, <8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z) {
+; X32-LABEL: test_mm512_maskz_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm512_maskz_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %__X, <8 x i64> %__Y, <8 x i64> %__Z)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x i64> %0, <8 x i64> zeroinitializer
+  ret <8 x i64> %2
+}
+
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)

Copied: llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll (from r333344, llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll?p2=llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll&p1=llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll&r1=333344&r2=333346&rev=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll Sat May 26 11:55:19 2018
@@ -6,9 +6,9 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -34,9 +34,9 @@ declare <8 x i64> @llvm.x86.avx512.maskz
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -62,9 +62,9 @@ declare <8 x i64> @llvm.x86.avx512.mask.
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -90,9 +90,9 @@ declare <8 x i64> @llvm.x86.avx512.maskz
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2

Modified: llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifma-intrinsics.ll Sat May 26 11:55:19 2018
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
 
-declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,28 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  %res4 = add <8 x i64> %res, %res1
-  %res5 = add <8 x i64> %res3, %res2
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0
+  %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+  %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %res4 = add <8 x i64> %3, %6
+  %res5 = add <8 x i64> %10, %9
   %res6 = add <8 x i64> %res5, %res4
   ret <8 x i64> %res6
 }
 
-declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +51,30 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  %res4 = add <8 x i64> %res, %res1
-  %res5 = add <8 x i64> %res3, %res2
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
+  %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+  %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %res4 = add <8 x i64> %3, %6
+  %res5 = add <8 x i64> %10, %9
   %res6 = add <8 x i64> %res5, %res4
   ret <8 x i64> %res6
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +85,28 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  %res4 = add <8 x i64> %res, %res1
-  %res5 = add <8 x i64> %res3, %res2
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> %x0
+  %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+  %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %res4 = add <8 x i64> %3, %6
+  %res5 = add <8 x i64> %10, %9
   %res6 = add <8 x i64> %res5, %res4
   ret <8 x i64> %res6
 }
 
-declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
 ; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -103,12 +117,18 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  %res4 = add <8 x i64> %res, %res1
-  %res5 = add <8 x i64> %res3, %res2
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  %4 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer
+  %7 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> zeroinitializer, <8 x i64> %x1, <8 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %9 = select <8 x i1> %8, <8 x i64> %7, <8 x i64> zeroinitializer
+  %10 = call <8 x i64> @llvm.x86.avx512.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %res4 = add <8 x i64> %3, %6
+  %res5 = add <8 x i64> %10, %9
   %res6 = add <8 x i64> %res5, %res4
   ret <8 x i64> %res6
 }
@@ -120,8 +140,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
 ; CHECK-NEXT:    retq
 
   %x2 = load <8 x i64>, <8 x i64>* %x2ptr
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  ret <8 x i64> %1
 }
 
 define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr) {
@@ -133,8 +153,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
   %x2load = load i64, i64* %x2ptr
   %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
   %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  ret <8 x i64> %1
 }
 
 define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2) {
@@ -144,8 +164,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
 ; CHECK-NEXT:    retq
 
   %x1 = load <8 x i64>, <8 x i64>* %x1ptr
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  ret <8 x i64> %1
 }
 
 define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2) {
@@ -157,8 +177,8 @@ define <8 x i64>@test_int_x86_avx512_vpm
   %x1load = load i64, i64* %x1ptr
   %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
   %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  ret <8 x i64> %1
 }
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
@@ -169,8 +189,10 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    retq
 
   %x2 = load <8 x i64>, <8 x i64>* %x2ptr
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
@@ -183,8 +205,10 @@ define <8 x i64>@test_int_x86_avx512_mas
   %x2load = load i64, i64* %x2ptr
   %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
   %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -195,8 +219,10 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    retq
 
   %x1 = load <8 x i64>, <8 x i64>* %x1ptr
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -209,8 +235,10 @@ define <8 x i64>@test_int_x86_avx512_mas
   %x1load = load i64, i64* %x1ptr
   %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
   %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load(<8 x i64> %x0, <8 x i64> %x1, <8 x i64>* %x2ptr, i8 %x3) {
@@ -221,8 +249,10 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    retq
 
   %x2 = load <8 x i64>, <8 x i64>* %x2ptr
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, <8 x i64> %x1, i64* %x2ptr, i8 %x3) {
@@ -235,8 +265,10 @@ define <8 x i64>@test_int_x86_avx512_mas
   %x2load = load i64, i64* %x2ptr
   %x2insert = insertelement <8 x i64> undef, i64 %x2load, i64 0
   %x2 = shufflevector <8 x i64> %x2insert, <8 x i64> undef, <8 x i32> zeroinitializer
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute(<8 x i64> %x0, <8 x i64>* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -247,8 +279,10 @@ define <8 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    retq
 
   %x1 = load <8 x i64>, <8 x i64>* %x1ptr
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  ret <8 x i64> %3
 }
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(<8 x i64> %x0, i64* %x1ptr, <8 x i64> %x2, i8 %x3) {
@@ -261,6 +295,8 @@ define <8 x i64>@test_int_x86_avx512_mas
   %x1load = load i64, i64* %x1ptr
   %x1insert = insertelement <8 x i64> undef, i64 %x1load, i64 0
   %x1 = shufflevector <8 x i64> %x1insert, <8 x i64> undef, <8 x i32> zeroinitializer
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  ret <8 x i64> %res
+  %1 = call <8 x i64> @llvm.x86.avx512.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  ret <8 x i64> %3
 }

Added: llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll?rev=333346&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll (added)
+++ llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-fast-isel.ll Sat May 26 11:55:19 2018
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X32
+; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512ifma,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512ifmavl-builtins.c
+
+define <2 x i64> @test_mm_madd52hi_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52hi_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52hi_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+  ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52hi_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+  ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52hi_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52hi_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_madd52hi_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+  ret <4 x i64> %2
+}
+
+define <2 x i64> @test_mm_madd52lo_epu64(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+  ret <2 x i64> %0
+}
+
+define <2 x i64> @test_mm_mask_madd52lo_epu64(<2 x i64> %__W, i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y) {
+; X32-LABEL: test_mm_mask_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_mask_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__W, <2 x i64> %__X, <2 x i64> %__Y)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> %__W
+  ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_madd52lo_epu64(i8 zeroext %__M, <2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z) {
+; X32-LABEL: test_mm_maskz_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_maskz_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %__X, <2 x i64> %__Y, <2 x i64> %__Z)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x i64> %0, <2 x i64> zeroinitializer
+  ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_madd52lo_epu64(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+  ret <4 x i64> %0
+}
+
+define <4 x i64> @test_mm256_mask_madd52lo_epu64(<4 x i64> %__W, i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y) {
+; X32-LABEL: test_mm256_mask_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_mask_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__W, <4 x i64> %__X, <4 x i64> %__Y)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> %__W
+  ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_madd52lo_epu64(i8 zeroext %__M, <4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z) {
+; X32-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X32:       # %bb.0: # %entry
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm256_maskz_madd52lo_epu64:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
+; X64-NEXT:    retq
+entry:
+  %0 = tail call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %__X, <4 x i64> %__Y, <4 x i64> %__Z)
+  %1 = bitcast i8 %__M to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x i64> %0, <4 x i64> zeroinitializer
+  ret <4 x i64> %2
+}
+
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)

Copied: llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll (from r333344, llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll?p2=llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll&p1=llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll&r1=333344&r2=333346&rev=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics-upgrade.ll Sat May 26 11:55:19 2018
@@ -6,9 +6,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -34,9 +34,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -62,9 +62,9 @@ declare <2 x i64> @llvm.x86.avx512.maskz
 define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -90,9 +90,9 @@ declare <4 x i64> @llvm.x86.avx512.maskz
 define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -118,9 +118,9 @@ declare <2 x i64> @llvm.x86.avx512.mask.
 define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -146,9 +146,9 @@ declare <4 x i64> @llvm.x86.avx512.mask.
 define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -174,9 +174,9 @@ declare <2 x i64> @llvm.x86.avx512.maskz
 define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -202,9 +202,9 @@ declare <4 x i64> @llvm.x86.avx512.maskz
 define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2

Modified: llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll?rev=333346&r1=333345&r2=333346&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512ifmavl-intrinsics.ll Sat May 26 11:55:19 2018
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
 
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -19,24 +19,33 @@ define <2 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
-  %res4 = add <2 x i64> %res, %res1
-  %res5 = add <2 x i64> %res3, %res2
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+  %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+  %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+  %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+  %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %res4 = add <2 x i64> %3, %6
+  %res5 = add <2 x i64> %10, %9
   %res6 = add <2 x i64> %res5, %res4
   ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
 
 define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -47,24 +56,31 @@ define <4 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
-  %res4 = add <4 x i64> %res, %res1
-  %res5 = add <4 x i64> %res3, %res2
+  %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+  %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+  %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+  %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %res4 = add <4 x i64> %3, %6
+  %res5 = add <4 x i64> %10, %9
   %res6 = add <4 x i64> %res5, %res4
   ret <4 x i64> %res6
 }
 
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
 define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -75,24 +91,31 @@ define <2 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
-  %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
-  %res4 = add <2 x i64> %res, %res1
-  %res5 = add <2 x i64> %res3, %res2
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+  %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+  %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+  %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+  %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52h.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %res4 = add <2 x i64> %3, %6
+  %res5 = add <2 x i64> %10, %9
   %res6 = add <2 x i64> %res5, %res4
   ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
 define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -103,24 +126,33 @@ define <4 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
-  %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
-  %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
-  %res4 = add <4 x i64> %res, %res1
-  %res5 = add <4 x i64> %res3, %res2
+  %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+  %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+  %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+  %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52h.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %res4 = add <4 x i64> %3, %6
+  %res5 = add <4 x i64> %10, %9
   %res6 = add <4 x i64> %res5, %res4
   ret <4 x i64> %res6
 }
 
-declare <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)
 
 define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -131,24 +163,33 @@ define <2 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <2 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
-  %res4 = add <2 x i64> %res, %res1
-  %res5 = add <2 x i64> %res3, %res2
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> %x0
+  %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> %x0
+  %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+  %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+  %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %res4 = add <2 x i64> %3, %6
+  %res5 = add <2 x i64> %10, %9
   %res6 = add <2 x i64> %res5, %res4
   ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>)
 
 define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -159,24 +200,31 @@ define <4 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <4 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
-  %res4 = add <4 x i64> %res, %res1
-  %res5 = add <4 x i64> %res3, %res2
+  %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> %x0
+  %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> %x0
+  %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+  %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %res4 = add <4 x i64> %3, %6
+  %res5 = add <4 x i64> %10, %9
   %res6 = add <4 x i64> %res5, %res4
   ret <4 x i64> %res6
 }
 
-declare <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
-
 define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -187,24 +235,31 @@ define <2 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
-  %res = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
-  %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <2 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
-  %res4 = add <2 x i64> %res, %res1
-  %res5 = add <2 x i64> %res3, %res2
+  %1 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract2, <2 x i64> %1, <2 x i64> zeroinitializer
+  %4 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <2 x i32> <i32 0, i32 1>
+  %6 = select <2 x i1> %extract1, <2 x i64> %4, <2 x i64> zeroinitializer
+  %7 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> zeroinitializer, <2 x i64> %x1, <2 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <2 x i32> <i32 0, i32 1>
+  %9 = select <2 x i1> %extract, <2 x i64> %7, <2 x i64> zeroinitializer
+  %10 = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2)
+  %res4 = add <2 x i64> %3, %6
+  %res5 = add <2 x i64> %10, %9
   %res6 = add <2 x i64> %res5, %res4
   ret <2 x i64> %res6
 }
 
-declare <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
-
 define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
@@ -215,12 +270,21 @@ define <4 x i64>@test_int_x86_avx512_mas
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
-  %res = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
-  %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res2 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
-  %res3 = call <4 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
-  %res4 = add <4 x i64> %res, %res1
-  %res5 = add <4 x i64> %res3, %res2
+  %1 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %extract2 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract2, <4 x i64> %1, <4 x i64> zeroinitializer
+  %4 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %5 = bitcast i8 %x3 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %6 = select <4 x i1> %extract1, <4 x i64> %4, <4 x i64> zeroinitializer
+  %7 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> zeroinitializer, <4 x i64> %x1, <4 x i64> zeroinitializer)
+  %8 = bitcast i8 %x3 to <8 x i1>
+  %extract = shufflevector <8 x i1> %8, <8 x i1> %8, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %9 = select <4 x i1> %extract, <4 x i64> %7, <4 x i64> zeroinitializer
+  %10 = call <4 x i64> @llvm.x86.avx512.vpmadd52l.uq.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2)
+  %res4 = add <4 x i64> %3, %6
+  %res5 = add <4 x i64> %10, %9
   %res6 = add <4 x i64> %res5, %res4
   ret <4 x i64> %res6
 }
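
For readers following the upgrade pattern in the rewritten tests above: a masked IFMA operation is now expressed as the unmasked intrinsic followed by a mask bitcast, a lane extract, and a select. A minimal standalone sketch of merge-masking at the 128-bit width (the wrapper name example_masked_madd52lo is illustrative, not part of this patch):

define <2 x i64> @example_masked_madd52lo(<2 x i64> %acc, <2 x i64> %a, <2 x i64> %b, i8 %mask) {
  ; Unmasked op on all lanes: accumulate the low 52 bits of the
  ; 52x52-bit product of %a and %b into %acc.
  %madd = call <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64> %acc, <2 x i64> %a, <2 x i64> %b)
  ; Reinterpret the i8 mask as 8 bits and keep the low two lanes.
  %bits = bitcast i8 %mask to <8 x i1>
  %lanes = shufflevector <8 x i1> %bits, <8 x i1> %bits, <2 x i32> <i32 0, i32 1>
  ; Merge-masking: disabled lanes keep the accumulator value. Zero-masking
  ; would select zeroinitializer here instead, as in the maskz tests above.
  %res = select <2 x i1> %lanes, <2 x i64> %madd, <2 x i64> %acc
  ret <2 x i64> %res
}

declare <2 x i64> @llvm.x86.avx512.vpmadd52l.uq.128(<2 x i64>, <2 x i64>, <2 x i64>)

This mirrors, modulo value names, the IR that the rewritten tests check for the old mask variants at this width.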
