[llvm] r258047 - AVX512: Masked store intrinsic implementation.

Igor Breger via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 18 05:52:57 PST 2016


Author: ibreger
Date: Mon Jan 18 07:52:57 2016
New Revision: 258047

URL: http://llvm.org/viewvc/llvm-project?rev=258047&view=rev
Log:
AVX512: Masked store intrinsic implementation.
Implemented intrinsics for the following store instructions: VMOVDQU8/16/32/64, VMOVDQA32/64, VMOVAPS/PD, VMOVUPS/PD.

Differential Revision: http://reviews.llvm.org/D16271

Modified:
    llvm/trunk/include/llvm/IR/IntrinsicsX86.td
    llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
    llvm/trunk/lib/Target/X86/X86InstrAVX512.td
    llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
    llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
    llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll

Modified: llvm/trunk/include/llvm/IR/IntrinsicsX86.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsX86.td?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsX86.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td Mon Jan 18 07:52:57 2016
@@ -1971,22 +1971,59 @@ let TargetPrefix = "x86" in {  // All in
         GCCBuiltin<"__builtin_ia32_maskstoreps256">,
         Intrinsic<[], [llvm_ptr_ty,
                   llvm_v8i32_ty, llvm_v8f32_ty], [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_storeu_ps_128 : 
+        GCCBuiltin<"__builtin_ia32_storeups128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_ps_256 : 
+        GCCBuiltin<"__builtin_ia32_storeups256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
   def int_x86_avx512_mask_storeu_ps_512 :
         GCCBuiltin<"__builtin_ia32_storeups512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
                   [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_storeu_pd_128 : 
+        GCCBuiltin<"__builtin_ia32_storeupd128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_pd_256 : 
+        GCCBuiltin<"__builtin_ia32_storeupd256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
   def int_x86_avx512_mask_storeu_pd_512 :
         GCCBuiltin<"__builtin_ia32_storeupd512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
                   [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_store_ps_128 : 
+        GCCBuiltin<"__builtin_ia32_storeaps128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_ps_256 : 
+        GCCBuiltin<"__builtin_ia32_storeaps256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
   def int_x86_avx512_mask_store_ps_512 :
         GCCBuiltin<"__builtin_ia32_storeaps512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
                   [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_store_pd_128 : 
+        GCCBuiltin<"__builtin_ia32_storeapd128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_pd_256 : 
+        GCCBuiltin<"__builtin_ia32_storeapd256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
   def int_x86_avx512_mask_store_pd_512 :
         GCCBuiltin<"__builtin_ia32_storeapd512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
-                  [IntrReadWriteArgMem]>;
+                  [IntrReadWriteArgMem]>;        
+        
   def int_x86_avx512_mask_store_ss :
         GCCBuiltin<"__builtin_ia32_storess_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty],
@@ -2894,14 +2931,84 @@ let TargetPrefix = "x86" in {  // All in
         GCCBuiltin<"__builtin_ia32_maskstoreq256">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty],
                   [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_storeu_b_128 : 
+        GCCBuiltin<"__builtin_ia32_storedquqi128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16i8_ty, llvm_i16_ty],
+                  [IntrReadWriteArgMem]>;  
+  def int_x86_avx512_mask_storeu_b_256 : 
+        GCCBuiltin<"__builtin_ia32_storedquqi256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty, llvm_i32_ty],
+                  [IntrReadWriteArgMem]>;  
+  def int_x86_avx512_mask_storeu_b_512 : 
+        GCCBuiltin<"__builtin_ia32_storedquqi512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v64i8_ty, llvm_i64_ty],
+                  [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_storeu_w_128 : 
+        GCCBuiltin<"__builtin_ia32_storedquhi128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;  
+  def int_x86_avx512_mask_storeu_w_256 : 
+        GCCBuiltin<"__builtin_ia32_storedquhi256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty],
+                  [IntrReadWriteArgMem]>;  
+  def int_x86_avx512_mask_storeu_w_512 : 
+         GCCBuiltin<"__builtin_ia32_storedquhi512_mask">,
+         Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty],
+                   [IntrReadWriteArgMem]>;
+
+  def int_x86_avx512_mask_storeu_d_128 :
+        GCCBuiltin<"__builtin_ia32_storedqusi128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_d_256 :
+        GCCBuiltin<"__builtin_ia32_storedqusi256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;     
   def int_x86_avx512_mask_storeu_d_512 :
         GCCBuiltin<"__builtin_ia32_storedqusi512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
                   [IntrReadWriteArgMem]>;
+        
+  def int_x86_avx512_mask_storeu_q_128 :
+        GCCBuiltin<"__builtin_ia32_storedqudi128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_storeu_q_256 :
+        GCCBuiltin<"__builtin_ia32_storedqudi256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
   def int_x86_avx512_mask_storeu_q_512 :
         GCCBuiltin<"__builtin_ia32_storedqudi512_mask">,
         Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
                   [IntrReadWriteArgMem]>;
+        
+  def int_x86_avx512_mask_store_d_128 :
+        GCCBuiltin<"__builtin_ia32_movdqa32store128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_d_256 :
+        GCCBuiltin<"__builtin_ia32_movdqa32store256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;     
+  def int_x86_avx512_mask_store_d_512 :
+        GCCBuiltin<"__builtin_ia32_movdqa32store512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty],
+                  [IntrReadWriteArgMem]>;
+        
+  def int_x86_avx512_mask_store_q_128 :
+        GCCBuiltin<"__builtin_ia32_movdqa64store128_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_q_256 :
+        GCCBuiltin<"__builtin_ia32_movdqa64store256_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;
+  def int_x86_avx512_mask_store_q_512 :
+        GCCBuiltin<"__builtin_ia32_movdqa64store512_mask">,
+        Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty],
+                  [IntrReadWriteArgMem]>;        
 }
 
 // Variable bit shift ops

Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Mon Jan 18 07:52:57 2016
@@ -4227,6 +4227,14 @@ bool X86TargetLowering::getTgtMemIntrins
     Info.writeMem = true;
     break;
   }
+  case STOREA:
+  case STOREU: {
+    Info.ptrVal = I.getArgOperand(0);
+    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
+    Info.align = (IntrData->Type == STOREA ? Info.memVT.getSizeInBits()/8 : 1);
+    Info.writeMem = true;
+    break;
+  }
   default:
     return false;
   }
@@ -17659,6 +17667,26 @@ static SDValue LowerINTRINSIC_W_CHAIN(SD
     return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                              MemIntr->getMemOperand(), ISD::NON_EXTLOAD);
   }
+  case STOREU:
+  case STOREA: {
+    SDValue Mask = Op.getOperand(4);
+    SDValue Data = Op.getOperand(3);
+    SDValue Addr = Op.getOperand(2);
+    SDValue Chain = Op.getOperand(0);
+
+    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
+    assert(MemIntr && "Expected MemIntrinsicSDNode!");
+
+    if (isAllOnesConstant(Mask)) // return just a store
+      return DAG.getStore(Chain, dl, Data, Addr, MemIntr->getMemOperand());
+
+    EVT VT  = MemIntr->getMemoryVT();
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+
+    return DAG.getMaskedStore(Chain, dl, Data, Addr, VMask, VT,
+                              MemIntr->getMemOperand(), false);
+  }
   }
 }
 

Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Mon Jan 18 07:52:57 2016
@@ -2707,24 +2707,6 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmo
                avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
                PD, VEX_W, EVEX_CD8<64, CD8VF>;
 
-def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
-          GR16:$mask),
-         (VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
-            VR512:$src)>;
-def: Pat<(int_x86_avx512_mask_storeu_pd_512 addr:$ptr, (v8f64 VR512:$src),
-          GR8:$mask),
-         (VMOVUPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
-            VR512:$src)>;
-
-def: Pat<(int_x86_avx512_mask_store_ps_512 addr:$ptr, (v16f32 VR512:$src),
-          GR16:$mask),
-         (VMOVAPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
-            VR512:$src)>;
-def: Pat<(int_x86_avx512_mask_store_pd_512 addr:$ptr, (v8f64 VR512:$src),
-          GR8:$mask),
-         (VMOVAPDZmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
-            VR512:$src)>;
-
 defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info,
                                        HasAVX512>,
                  avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info,
@@ -2759,15 +2741,6 @@ def: Pat<(v8i64 (int_x86_avx512_mask_loa
                 (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
        (VMOVDQU64Zrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
 
-def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
-            GR16:$mask),
-         (VMOVDQU32Zmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
-            VR512:$src)>;
-def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
-            GR8:$mask),
-         (VMOVDQU64Zmrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
-            VR512:$src)>;
-
 let AddedComplexity = 20 in {
 def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
                           (bc_v8i64 (v16i32 immAllZerosV)))),

Modified: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h (original)
+++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h Mon Jan 18 07:52:57 2016
@@ -29,8 +29,9 @@ enum IntrinsicType {
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
-  EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC,
-  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+  EXPAND_FROM_MEM, LOADA, LOADU, STOREA, STOREU, BLEND, INSERT_SUBVEC,
+  TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK,
+  CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
 };
 
 struct IntrinsicData {
@@ -197,6 +198,36 @@ static const IntrinsicData IntrinsicsWit
                      X86ISD::VTRUNC, 0),
   X86_INTRINSIC_DATA(avx512_mask_pmov_wb_mem_512, TRUNCATE_TO_MEM_VI8,
                      X86ISD::VTRUNC, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_d_128, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_d_256, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_d_512, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_pd_128, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_pd_256, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_pd_512, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_ps_128, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_ps_256, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_ps_512, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_q_128, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_q_256, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_store_q_512, STOREA, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_b_128, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_b_256, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_b_512, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_d_128, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_d_256, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_d_512, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_pd_128, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_pd_256, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_pd_512, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_ps_128, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_ps_256, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_ps_512, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_q_128, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_q_256, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_q_512, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_w_128, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_w_256, STOREU, ISD::DELETED_NODE, 0),
+  X86_INTRINSIC_DATA(avx512_mask_storeu_w_512, STOREU, ISD::DELETED_NODE, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpd_512, SCATTER, X86::VSCATTERDPDZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpi_512, SCATTER, X86::VPSCATTERDDZmr, 0),
   X86_INTRINSIC_DATA(avx512_scatter_dpq_512, SCATTER, X86::VPSCATTERDQZmr, 0),

Modified: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll Mon Jan 18 07:52:57 2016
@@ -858,49 +858,57 @@ define i16 @test_vptestmd(<16 x i32> %a0
 }
 declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
 
-define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
+define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
 ; CHECK-LABEL: test_store1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovups %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovups %zmm0, (%rsi)
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+  call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
   ret void
 }
 
 declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
 
-define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
+define void @test_store2(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
 ; CHECK-LABEL: test_store2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovupd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovupd %zmm0, (%rsi)
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+  call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
   ret void
 }
 
 declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8)
 
-define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
+define void @test_mask_store_aligned_ps(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) {
 ; CHECK-LABEL: test_mask_store_aligned_ps:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovaps %zmm0, (%rsi)
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+  call void @llvm.x86.avx512.mask.store.ps.512(i8* %ptr2, <16 x float> %data, i16 -1)
   ret void
 }
 
 declare void @llvm.x86.avx512.mask.store.ps.512(i8*, <16 x float>, i16 )
 
-define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
+define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8* %ptr2, i8 %mask) {
 ; CHECK-LABEL: test_mask_store_aligned_pd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovw %edx, %k1
 ; CHECK-NEXT:    vmovapd %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovapd %zmm0, (%rsi)
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+  call void @llvm.x86.avx512.mask.store.pd.512(i8* %ptr2, <8 x double> %data, i8 -1)
   ret void
 }
 
@@ -922,6 +930,62 @@ define <16 x float> @test_mask_load_alig
   ret <16 x float> %res4
 }
 
+declare void @llvm.x86.avx512.mask.storeu.q.512(i8*, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu64 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu64 %zmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.512(i8*, <16 x i32>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu32 %zmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
+  call void @llvm.x86.avx512.mask.storeu.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.512(i8*, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_512(i8* %ptr1, i8* %ptr2, <8 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr1, <8 x i64> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.q.512(i8* %ptr2, <8 x i64> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.512(i8*, <16 x i32>, i16)
+
+define void@test_int_x86_avx512_mask_store_d_512(i8* %ptr1, i8* %ptr2, <16 x i32> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqa32 %zmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqa32 %zmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr1, <16 x i32> %x1, i16 %x2)
+  call void @llvm.x86.avx512.mask.store.d.512(i8* %ptr2, <16 x i32> %x1, i16 -1)
+  ret void
+}
+
 declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
 
 define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
@@ -6897,8 +6961,6 @@ define <8 x i64>@test_int_x86_avx512_mas
   ret <8 x i64> %res4
 }
 
-
-
 declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
@@ -6938,4 +7000,3 @@ define <16 x i32>@test_int_x86_avx512_ma
   %res4 = add <16 x i32> %res3, %res2
   ret <16 x i32> %res4
 }
-

Modified: llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bw-intrinsics.ll Mon Jan 18 07:52:57 2016
@@ -3107,7 +3107,6 @@ define <32 x i16>@test_int_x86_avx512_ma
   ret <32 x i16> %res4
 }
 
-
 declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) {
@@ -3147,3 +3146,51 @@ define <32 x i16>@test_int_x86_avx512_ma
   %res4 = add <32 x i16> %res3, %res2
   ret <32 x i16> %res4
 }
+
+declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64)
+
+define void@test_int_x86_avx512_mask_storeu_b_512(i8* %ptr1, i8* %ptr2, <64 x i8> %x1, i64 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdx, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_b_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    vmovdqu8 %zmm0, (%ecx) {%k1}
+; AVX512F-32-NEXT:    vmovdqu8 %zmm0, (%eax)
+; AVX512F-32-NEXT:    retl
+  call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr1, <64 x i8> %x1, i64 %x2)
+  call void @llvm.x86.avx512.mask.storeu.b.512(i8* %ptr2, <64 x i8> %x1, i64 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.512(i8*, <32 x i16>, i32)
+
+define void@test_int_x86_avx512_mask_storeu_w_512(i8* %ptr1, i8* %ptr2, <32 x i16> %x1, i32 %x2) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edx, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rdi) {%k1}
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_storeu_w_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; AVX512F-32-NEXT:    vmovdqu16 %zmm0, (%ecx) {%k1}
+; AVX512F-32-NEXT:    vmovdqu16 %zmm0, (%eax)
+; AVX512F-32-NEXT:    retl
+  call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr1, <32 x i16> %x1, i32 %x2)
+  call void @llvm.x86.avx512.mask.storeu.w.512(i8* %ptr2, <32 x i16> %x1, i32 -1)
+  ret void
+}

Modified: llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512bwvl-intrinsics.ll Mon Jan 18 07:52:57 2016
@@ -5107,3 +5107,59 @@ define <16 x i16>@test_int_x86_avx512_ma
   %res4 = add <16 x i16> %res3, %res2
   ret <16 x i16> %res4
 }
+
+declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_b_128(i8* %ptr1, i8* %ptr2, <16 x i8> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu8 %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu8 %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr1, <16 x i8> %x1, i16 %x2)
+  call void @llvm.x86.avx512.mask.storeu.b.128(i8* %ptr2, <16 x i8> %x1, i16 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.b.256(i8*, <32 x i8>, i32)
+
+define void@test_int_x86_avx512_mask_storeu_b_256(i8* %ptr1, i8* %ptr2, <32 x i8> %x1, i32 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edx, %k1
+; CHECK-NEXT:    vmovdqu8 %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu8 %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr1, <32 x i8> %x1, i32 %x2)
+  call void @llvm.x86.avx512.mask.storeu.b.256(i8* %ptr2, <32 x i8> %x1, i32 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.128(i8*, <8 x i16>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_w_128(i8* %ptr1, i8* %ptr2, <8 x i16> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu16 %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu16 %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr1, <8 x i16> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.w.128(i8* %ptr2, <8 x i16> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.w.256(i8*, <16 x i16>, i16)
+
+define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i16> %x1, i16 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu16 %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu16 %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr1, <16 x i16> %x1, i16 %x2)
+  call void @llvm.x86.avx512.mask.storeu.w.256(i8* %ptr2, <16 x i16> %x1, i16 -1)
+  ret void
+}

Modified: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll?rev=258047&r1=258046&r2=258047&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll Mon Jan 18 07:52:57 2016
@@ -7466,3 +7466,227 @@ define <8 x i32>@test_int_x86_avx512_mas
   %res4 = add <8 x i32> %res3, %res2
   ret <8 x i32> %res4
 }
+
+declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8)
+
+define void@test_int_x86_avx512_mask_store_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovapd %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovapd %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.pd.256(i8*, <4 x double>, i8)
+
+define void@test_int_x86_avx512_mask_store_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovapd %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovapd %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.128(i8*, <2 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_pd_128(i8* %ptr1, i8* %ptr2, <2 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovupd %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovupd %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr1, <2 x double> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.pd.128(i8* %ptr2, <2 x double> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.pd.256(i8*, <4 x double>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_pd_256(i8* %ptr1, i8* %ptr2, <4 x double> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovupd %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovupd %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr1, <4 x double> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.pd.256(i8* %ptr2, <4 x double> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.128(i8*, <4 x float>, i8)
+
+define void@test_int_x86_avx512_mask_store_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovaps %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovaps %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+    call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2)
+    call void @llvm.x86.avx512.mask.store.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1)
+    ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.ps.256(i8*, <8 x float>, i8)
+
+define void@test_int_x86_avx512_mask_store_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovaps %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovaps %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+    call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2)
+    call void @llvm.x86.avx512.mask.store.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1)
+    ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.128(i8*, <4 x float>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_ps_128(i8* %ptr1, i8* %ptr2, <4 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovups %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovups %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+    call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr1, <4 x float> %x1, i8 %x2)
+    call void @llvm.x86.avx512.mask.storeu.ps.128(i8* %ptr2, <4 x float> %x1, i8 -1)
+    ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.ps.256(i8*, <8 x float>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_ps_256(i8* %ptr1, i8* %ptr2, <8 x float> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovups %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovups %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+    call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr1, <8 x float> %x1, i8 %x2)
+    call void @llvm.x86.avx512.mask.storeu.ps.256(i8* %ptr2, <8 x float> %x1, i8 -1)
+    ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.128(i8*, <2 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu64 %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu64 %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.q.256(i8*, <4 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_q_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu64 %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu64 %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.128(i8*, <4 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu32 %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu32 %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.storeu.d.256(i8*, <8 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_storeu_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_storeu_d_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqu32 %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqu32 %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.storeu.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.128(i8*, <2 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_128(i8* %ptr1, i8* %ptr2, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqa64 %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqa64 %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr1, <2 x i64> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.q.128(i8* %ptr2, <2 x i64> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.256(i8*, <4 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_q_256(i8* %ptr1, i8* %ptr2, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_q_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqa64 %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqa64 %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr1, <4 x i64> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.q.256(i8* %ptr2, <4 x i64> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.128(i8*, <4 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_128(i8* %ptr1, i8* %ptr2, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqa32 %xmm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqa32 %xmm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr1, <4 x i32> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.d.128(i8* %ptr2, <4 x i32> %x1, i8 -1)
+  ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.256(i8*, <8 x i32>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_256(i8* %ptr1, i8* %ptr2, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_store_d_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vmovdqa32 %ymm0, (%rdi) {%k1}
+; CHECK-NEXT:    vmovdqa32 %ymm0, (%rsi)
+; CHECK-NEXT:    retq
+  call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr1, <8 x i32> %x1, i8 %x2)
+  call void @llvm.x86.avx512.mask.store.d.256(i8* %ptr2, <8 x i32> %x1, i8 -1)
+  ret void
+}




More information about the llvm-commits mailing list