[llvm] 9b1c98c - [X86] Add 32-bit command lines to masked_store.ll and masked_load.ll
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 20 13:47:28 PDT 2020
Author: Craig Topper
Date: 2020-09-20T13:46:59-07:00
New Revision: 9b1c98c0fbe2d7fdc22debd3e7d1fcf44952a0ce
URL: https://github.com/llvm/llvm-project/commit/9b1c98c0fbe2d7fdc22debd3e7d1fcf44952a0ce
DIFF: https://github.com/llvm/llvm-project/commit/9b1c98c0fbe2d7fdc22debd3e7d1fcf44952a0ce.diff
LOG: [X86] Add 32-bit command lines to masked_store.ll and masked_load.ll
Added:
Modified:
llvm/test/CodeGen/X86/masked_load.ll
llvm/test/CodeGen/X86/masked_store.ll
Removed:
################################################################################
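For context: each function in these files pairs LLVM IR with autogenerated FileCheck assertions, one check prefix per RUN configuration. A minimal sketch of the pattern the new i686 RUN line exercises (hypothetical function name; the real tests follow in the diff):

; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX512
define <4 x i32> @example_load(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %dst) {
  ; Compare the trigger against zero to build the <4 x i1> mask.
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  ; Masked load: lanes with a set mask bit read from %addr, others take %dst.
  %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

Under the i686 (32-bit) triple, pointer arguments arrive on the stack rather than in %rdi, which is why each X86-AVX512 block below begins by loading the address with movl {{[0-9]+}}(%esp), %eax before the masked operation.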
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index c0b274d85abd..30f4e9f56526 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -6,6 +6,7 @@
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX512
;
; vXf64
@@ -29,6 +30,25 @@ define <1 x double> @load_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: LBB0_2: ## %else
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v1f64_v1i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: subl $12, %esp
+; X86-AVX512-NEXT: .cfi_def_cfa_offset 16
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: jne LBB0_1
+; X86-AVX512-NEXT: ## %bb.2: ## %cond.load
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: jmp LBB0_3
+; X86-AVX512-NEXT: LBB0_1:
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: LBB0_3: ## %else
+; X86-AVX512-NEXT: vmovsd %xmm0, (%esp)
+; X86-AVX512-NEXT: fldl (%esp)
+; X86-AVX512-NEXT: addl $12, %esp
+; X86-AVX512-NEXT: retl
%mask = icmp eq <1 x i64> %trigger, zeroinitializer
%res = call <1 x double> @llvm.masked.load.v1f64.p0v1f64(<1 x double>* %addr, i32 4, <1 x i1> %mask, <1 x double> %dst)
ret <1 x double> %res
@@ -106,6 +126,13 @@ define <2 x double> @load_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vblendmpd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v2f64_v2i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmq %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vblendmpd (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> %mask, <2 x double> %dst)
ret <2 x double> %res
@@ -185,6 +212,13 @@ define <4 x double> @load_v4f64_v4i32(<4 x i32> %trigger, <4 x double>* %addr, <
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v4f64_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vblendmpd (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double> %dst)
ret <4 x double> %res
@@ -262,6 +296,13 @@ define <4 x double> @load_v4f64_v4i32_zero(<4 x i32> %trigger, <4 x double>* %ad
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovapd (%rdi), %ymm0 {%k1} {z}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v4f64_v4i32_zero:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vmovapd (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 32, <4 x i1> %mask, <4 x double>zeroinitializer)
ret <4 x double> %res
@@ -383,6 +424,13 @@ define <4 x double> @load_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <
; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vblendmpd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v4f64_v4i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT: vblendmpd (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i64> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> %mask, <4 x double> %dst)
ret <4 x double> %res
@@ -513,6 +561,13 @@ define <8 x double> @load_v8f64_v8i16(<8 x i16> %trigger, <8 x double>* %addr, <
; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8f64_v8i16:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vblendmpd (%eax), %zmm1, %zmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i16> %trigger, zeroinitializer
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
ret <8 x double> %res
@@ -709,6 +764,13 @@ define <8 x double> @load_v8f64_v8i64(<8 x i64> %trigger, <8 x double>* %addr, <
; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
; AVX512-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8f64_v8i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
+; X86-AVX512-NEXT: vblendmpd (%eax), %zmm1, %zmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i64> %trigger, zeroinitializer
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> %mask, <8 x double> %dst)
ret <8 x double> %res
@@ -806,6 +868,15 @@ define <2 x float> @load_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v2f32_v2i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
+; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT: vblendmps (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
ret <2 x float> %res
@@ -893,6 +964,15 @@ define <2 x float> @load_v2f32_v2i32_undef(<2 x i32> %trigger, <2 x float>* %add
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v2f32_v2i32_undef:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
+; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float>undef)
ret <2 x float> %res
@@ -1005,6 +1085,13 @@ define <4 x float> @load_v4f32_v4i32(<4 x i32> %trigger, <4 x float>* %addr, <4
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vblendmps (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v4f32_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vblendmps (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> %mask, <4 x float> %dst)
ret <4 x float> %res
@@ -1197,6 +1284,14 @@ define <8 x float> @load_v8f32_v8i1_zero(<8 x i1> %mask, <8 x float>* %addr) {
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vmovaps (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8f32_v8i1_zero:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0
+; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovaps (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> zeroinitializer)
ret <8 x float> %res
}
@@ -1389,6 +1484,13 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, <8 x float>* %addr, <8
; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vblendmps (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8f32_v8i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT: vblendmps (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 32, <8 x i1> %mask, <8 x float> %dst)
ret <8 x float> %res
@@ -1421,6 +1523,21 @@ define <1 x i64> @load_v1i64_v1i64(<1 x i64> %trigger, <1 x i64>* %addr, <1 x i6
; AVX-NEXT: LBB12_1:
; AVX-NEXT: movq %rdx, %rax
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v1i64_v1i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: orl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: jne LBB12_1
+; X86-AVX512-NEXT: ## %bb.2: ## %cond.load
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: movl (%ecx), %eax
+; X86-AVX512-NEXT: movl 4(%ecx), %edx
+; X86-AVX512-NEXT: retl
+; X86-AVX512-NEXT: LBB12_1:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: retl
%mask = icmp eq <1 x i64> %trigger, zeroinitializer
%res = call <1 x i64> @llvm.masked.load.v1i64.p0v1i64(<1 x i64>* %addr, i32 4, <1 x i1> %mask, <1 x i64> %dst)
ret <1 x i64> %res
@@ -1507,6 +1624,13 @@ define <2 x i64> @load_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i6
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpblendmq (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v2i64_v2i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmq %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vpblendmq (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> %mask, <2 x i64> %dst)
ret <2 x i64> %res
@@ -1630,6 +1754,13 @@ define <4 x i64> @load_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i6
; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpblendmq (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v4i64_v4i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmq %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT: vpblendmq (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i64> %trigger, zeroinitializer
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> %mask, <4 x i64> %dst)
ret <4 x i64> %res
@@ -1831,6 +1962,13 @@ define <8 x i64> @load_v8i64_v8i16(<8 x i16> %trigger, <8 x i64>* %addr, <8 x i6
; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8i64_v8i16:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vpblendmq (%eax), %zmm1, %zmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i16> %trigger, zeroinitializer
%res = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
ret <8 x i64> %res
@@ -2031,6 +2169,13 @@ define <8 x i64> @load_v8i64_v8i64(<8 x i64> %trigger, <8 x i64>* %addr, <8 x i6
; AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
; AVX512-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8i64_v8i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmq %zmm0, %zmm0, %k1
+; X86-AVX512-NEXT: vpblendmq (%eax), %zmm1, %zmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i64> %trigger, zeroinitializer
%res = call <8 x i64> @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %addr, i32 4, <8 x i1> %mask, <8 x i64> %dst)
ret <8 x i64> %res
@@ -2136,6 +2281,15 @@ define <2 x i32> @load_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i3
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v2i32_v2i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
+; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT: vpblendmd (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %addr, i32 4, <2 x i1> %mask, <2 x i32> %dst)
ret <2 x i32> %res
@@ -2255,6 +2409,13 @@ define <4 x i32> @load_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i3
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpblendmd (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v4i32_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vpblendmd (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %dst)
ret <4 x i32> %res
@@ -2448,6 +2609,14 @@ define <8 x i32> @load_v8i32_v8i1(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %d
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmd (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8i32_v8i1:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0
+; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpblendmd (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> %dst)
ret <8 x i32> %res
}
@@ -2638,6 +2807,14 @@ define <8 x i32> @load_v8i32_v8i1_zero(<8 x i1> %mask, <8 x i32>* %addr) {
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8i32_v8i1_zero:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: vpsllw $15, %xmm0, %xmm0
+; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovdqu32 (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> %mask, <8 x i32> zeroinitializer)
ret <8 x i32> %res
}
@@ -2914,6 +3091,13 @@ define <8 x i16> @load_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i1
; AVX512VLBW-NEXT: vpmovw2m %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmw (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v8i16_v8i16:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovw2m %xmm0, %k1
+; X86-AVX512-NEXT: vpblendmw (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp slt <8 x i16> %trigger, zeroinitializer
%res = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr, i32 4, <8 x i1> %mask, <8 x i16> %dst)
ret <8 x i16> %res
@@ -3624,6 +3808,13 @@ define <16 x i16> @load_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <1
; AVX512VLBW-NEXT: vpmovw2m %ymm0, %k1
; AVX512VLBW-NEXT: vpblendmw (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v16i16_v16i16:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovw2m %ymm0, %k1
+; X86-AVX512-NEXT: vpblendmw (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp slt <16 x i16> %trigger, zeroinitializer
%res = call <16 x i16> @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %addr, i32 4, <16 x i1> %mask, <16 x i16> %dst)
ret <16 x i16> %res
@@ -4322,6 +4513,13 @@ define <16 x i8> @load_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i
; AVX512VLBW-NEXT: vpmovb2m %xmm0, %k1
; AVX512VLBW-NEXT: vpblendmb (%rdi), %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v16i8_v16i8:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovb2m %xmm0, %k1
+; X86-AVX512-NEXT: vpblendmb (%eax), %xmm1, %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp slt <16 x i8> %trigger, zeroinitializer
%res = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %addr, i32 4, <16 x i1> %mask, <16 x i8> %dst)
ret <16 x i8> %res
@@ -6101,6 +6299,13 @@ define <32 x i8> @load_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i
; AVX512VLBW-NEXT: vpmovb2m %ymm0, %k1
; AVX512VLBW-NEXT: vpblendmb (%rdi), %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: load_v32i8_v32i8:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovb2m %ymm0, %k1
+; X86-AVX512-NEXT: vpblendmb (%eax), %ymm1, %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp slt <32 x i8> %trigger, zeroinitializer
%res = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %addr, i32 4, <32 x i1> %mask, <32 x i8> %dst)
ret <32 x i8> %res
@@ -6157,6 +6362,14 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst)
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4f32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $13, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovups (%eax), %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
ret <4 x float> %res
}
@@ -6171,6 +6384,12 @@ define <4 x float> @mload_constmask_v4f32_all(<4 x float>* %addr) {
; AVX: ## %bb.0:
; AVX-NEXT: vmovups (%rdi), %xmm0
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4f32_all:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovups (%eax), %xmm0
+; X86-AVX512-NEXT: retl
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float>undef)
ret <4 x float> %res
}
@@ -6185,6 +6404,12 @@ define <2 x double> @mload_constmask_v2f64(<2 x double>* %addr, <2 x double> %ds
; AVX: ## %bb.0:
; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v2f64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; X86-AVX512-NEXT: retl
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x double> %dst)
ret <2 x double> %res
}
@@ -6247,6 +6472,14 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $14, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu32 (%eax), %xmm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
ret <4 x i32> %res
}
@@ -6267,6 +6500,13 @@ define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrq $1, 8(%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v2i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpinsrd $2, 8(%eax), %xmm0, %xmm0
+; X86-AVX512-NEXT: vpinsrd $3, 12(%eax), %xmm0, %xmm0
+; X86-AVX512-NEXT: retl
%res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
ret <2 x i64> %res
}
@@ -6322,6 +6562,14 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst)
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v8f32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $7, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovups (%eax), %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
ret <8 x float> %res
}
@@ -6369,6 +6617,14 @@ define <8 x float> @mload_constmask_v8f32_zero(<8 x float>* %addr, <8 x float> %
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovups (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v8f32_zero:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $7, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovups (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> zeroinitializer)
ret <8 x float> %res
}
@@ -6409,6 +6665,14 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %ds
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4f64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $7, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovupd (%eax), %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
ret <4 x double> %res
}
@@ -6465,6 +6729,14 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu32 (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v8i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $-121, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu32 (%eax), %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
ret <8 x i32> %res
}
@@ -6510,6 +6782,14 @@ define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $9, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu64 (%eax), %ymm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
ret <4 x i64> %res
}
@@ -6550,6 +6830,14 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovupd (%rdi), %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v8f64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $-121, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovupd (%eax), %zmm0 {%k1}
+; X86-AVX512-NEXT: retl
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
ret <8 x double> %res
}
@@ -6613,6 +6901,15 @@ define <16 x double> @mload_constmask_v16f64_allones_split(<16 x double>* %addr,
; AVX512VLBW-NEXT: vmovupd 64(%rdi), %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovups (%rdi), %zmm0
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v16f64_allones_split:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $85, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovupd 64(%eax), %zmm1 {%k1}
+; X86-AVX512-NEXT: vmovups (%eax), %zmm0
+; X86-AVX512-NEXT: retl
%res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %addr, i32 4, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, <16 x double> %dst)
ret <16 x double> %res
}
@@ -6653,6 +6950,14 @@ define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovupd (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4f64_undef_passthrough:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $7, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovupd (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
ret <4 x double> %res
}
@@ -6698,6 +7003,14 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
; AVX512VLBW-NEXT: kmovd %eax, %k1
; AVX512VLBW-NEXT: vmovdqu64 (%rdi), %ymm0 {%k1} {z}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mload_constmask_v4i64_undef_passthrough:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movb $6, %cl
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu64 (%eax), %ymm0 {%k1} {z}
+; X86-AVX512-NEXT: retl
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
ret <4 x i64> %res
}
@@ -6720,6 +7033,12 @@ define <4 x i32> @load_one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX: ## %bb.0:
; AVX-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: load_one_mask_bit_set1:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpinsrd $0, (%eax), %xmm0, %xmm0
+; X86-AVX512-NEXT: retl
%res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %val)
ret <4 x i32> %res
}
@@ -6743,6 +7062,12 @@ define <4 x float> @load_one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val)
; AVX: ## %bb.0:
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: load_one_mask_bit_set2:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-AVX512-NEXT: retl
%res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x float> %val)
ret <4 x float> %res
}
@@ -6780,6 +7105,15 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX512-NEXT: vpinsrq $0, 16(%rdi), %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512-NEXT: retq
+;
+; X86-AVX512-LABEL: load_one_mask_bit_set3:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; X86-AVX512-NEXT: vpinsrd $0, 16(%eax), %xmm1, %xmm1
+; X86-AVX512-NEXT: vpinsrd $1, 20(%eax), %xmm1, %xmm1
+; X86-AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: retl
%res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
ret <4 x i64> %res
}
@@ -6798,6 +7132,14 @@ define <4 x double> @load_one_mask_bit_set4(<4 x double>* %addr, <4 x double> %v
; AVX-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: load_one_mask_bit_set4:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X86-AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT: retl
%res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x double> %val)
ret <4 x double> %res
}
@@ -6823,6 +7165,14 @@ define <8 x double> @load_one_mask_bit_set5(<8 x double>* %addr, <8 x double> %v
; AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
; AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
+;
+; X86-AVX512-LABEL: load_one_mask_bit_set5:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm1
+; X86-AVX512-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; X86-AVX512-NEXT: vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; X86-AVX512-NEXT: retl
%res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x double> %val)
ret <8 x double> %res
}
@@ -6847,6 +7197,17 @@ define i32 @pr38986(i1 %c, i32* %p) {
; AVX-NEXT: movl (%rsi), %eax
; AVX-NEXT: LBB44_2: ## %else
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: pr38986:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: testb $1, {{[0-9]+}}(%esp)
+; X86-AVX512-NEXT: ## implicit-def: $eax
+; X86-AVX512-NEXT: je LBB44_2
+; X86-AVX512-NEXT: ## %bb.1: ## %cond.load
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl (%eax), %eax
+; X86-AVX512-NEXT: LBB44_2: ## %else
+; X86-AVX512-NEXT: retl
%vc = insertelement <1 x i1> undef, i1 %c, i32 0
%vp = bitcast i32* %p to <1 x i32>*
%L = call <1 x i32> @llvm.masked.load.v1i32.p0v1i32 (<1 x i32>* %vp, i32 4, <1 x i1> %vc, <1 x i32> undef)
@@ -6862,6 +7223,10 @@ define <2 x double> @zero_mask(<2 x double>* %addr, <2 x double> %dst) {
; AVX-LABEL: zero_mask:
; AVX: ## %bb.0:
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: zero_mask:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: retl
%res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %addr, i32 4, <2 x i1> zeroinitializer, <2 x double> %dst)
ret <2 x double> %res
}
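The masked_store.ll changes below mirror the load tests: the trigger and the value to store arrive in vector registers, while the destination pointer is again loaded from the stack. A minimal sketch of the store-side pattern (hypothetical function name, same RUN configuration as above):

define void @example_store(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
  ; Build the per-lane mask from the trigger, as in the load sketch.
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  ; Masked store: only lanes with a set mask bit are written to %addr.
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
  ret void
}
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)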
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 992ef96fd2e8..417463226063 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -6,6 +6,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx512f,avx512bw,avx512dq,avx512vl | FileCheck %s --check-prefixes=X86-AVX512
;
; vXf64
@@ -29,6 +30,17 @@ define void @store_v1f64_v1i64(<1 x i64> %trigger, <1 x double>* %addr, <1 x dou
; AVX-NEXT: vmovsd %xmm0, (%rsi)
; AVX-NEXT: LBB0_2: ## %else
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v1f64_v1i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-AVX512-NEXT: jns LBB0_2
+; X86-AVX512-NEXT: ## %bb.1: ## %cond.store
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovsd %xmm0, (%eax)
+; X86-AVX512-NEXT: LBB0_2: ## %else
+; X86-AVX512-NEXT: retl
%mask = icmp slt <1 x i64> %trigger, zeroinitializer
call void @llvm.masked.store.v1f64.p0v1f64(<1 x double> %val, <1 x double>* %addr, i32 4, <1 x i1> %mask)
ret void
@@ -82,6 +94,13 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x dou
; AVX512VLBW-NEXT: vpcmpgtq %xmm0, %xmm2, %k1
; AVX512VLBW-NEXT: vmovupd %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v2f64_v2i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovq2m %xmm0, %k1
+; X86-AVX512-NEXT: vmovupd %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp slt <2 x i64> %trigger, zeroinitializer
call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %addr, i32 4, <2 x i1> %mask)
ret void
@@ -153,6 +172,14 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
; AVX512VLBW-NEXT: vmovupd %ymm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v4f64_v4i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovq2m %ymm0, %k1
+; X86-AVX512-NEXT: vmovupd %ymm1, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%mask = icmp slt <4 x i64> %trigger, zeroinitializer
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1> %mask)
ret void
@@ -240,6 +267,15 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x floa
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v2f32_v2i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
+; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT: vmovups %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
ret void
@@ -344,6 +380,13 @@ define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y
; AVX512VLBW-NEXT: vpcmpgtd %xmm2, %xmm1, %k1
; AVX512VLBW-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v4f32_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovd2m %xmm2, %k1
+; X86-AVX512-NEXT: vmovups %xmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%bool_mask = icmp slt <4 x i32> %mask, zeroinitializer
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %bool_mask)
ret void
@@ -516,6 +559,14 @@ define void @store_v8f32_v8i32(<8 x float> %x, <8 x float>* %ptr, <8 x float> %y
; AVX512VLBW-NEXT: vmovups %ymm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v8f32_v8i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovd2m %ymm2, %k1
+; X86-AVX512-NEXT: vmovups %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%bool_mask = icmp slt <8 x i32> %mask, zeroinitializer
call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> %x, <8 x float>* %ptr, i32 1, <8 x i1> %bool_mask)
ret void
@@ -813,6 +864,14 @@ define void @store_v16f32_v16i32(<16 x float> %x, <16 x float>* %ptr, <16 x floa
; AVX512VLBW-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v16f32_v16i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovd2m %zmm2, %k1
+; X86-AVX512-NEXT: vmovups %zmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%bool_mask = icmp slt <16 x i32> %mask, zeroinitializer
call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %x, <16 x float>* %ptr, i32 1, <16 x i1> %bool_mask)
ret void
@@ -894,6 +953,13 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %
; AVX512VLBW-NEXT: vpcmpgtq %xmm0, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu64 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v2i64_v2i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovq2m %xmm0, %k1
+; X86-AVX512-NEXT: vmovdqu64 %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp slt <2 x i64> %trigger, zeroinitializer
call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %val, <2 x i64>* %addr, i32 4, <2 x i1> %mask)
ret void
@@ -1006,6 +1072,14 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
; AVX512VLBW-NEXT: vmovdqu64 %ymm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v4i64_v4i64:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovq2m %ymm0, %k1
+; X86-AVX512-NEXT: vmovdqu64 %ymm1, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%mask = icmp slt <4 x i64> %trigger, zeroinitializer
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1> %mask)
ret void
@@ -1033,6 +1107,17 @@ define void @store_v1i32_v1i32(<1 x i32> %trigger, <1 x i32>* %addr, <1 x i32> %
; AVX-NEXT: movl %edx, (%rsi)
; AVX-NEXT: LBB9_2: ## %else
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v1i32_v1i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-AVX512-NEXT: jne LBB9_2
+; X86-AVX512-NEXT: ## %bb.1: ## %cond.store
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: movl %eax, (%ecx)
+; X86-AVX512-NEXT: LBB9_2: ## %else
+; X86-AVX512-NEXT: retl
%mask = icmp eq <1 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v1i32.p0v1i32(<1 x i32> %val, <1 x i32>* %addr, i32 4, <1 x i1> %mask)
ret void
@@ -1124,6 +1209,15 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %
; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v2i32_v2i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k0
+; X86-AVX512-NEXT: kshiftlb $6, %k0, %k0
+; X86-AVX512-NEXT: kshiftrb $6, %k0, %k1
+; X86-AVX512-NEXT: vmovdqu32 %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
ret void
@@ -1231,6 +1325,13 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v4i32_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vmovdqu32 %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
ret void
@@ -1409,6 +1510,14 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %
; AVX512VL-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v8i32_v8i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT: vmovdqu32 %ymm1, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask)
ret void
@@ -1753,6 +1862,13 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i16> %
; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v8i16_v8i16:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmw %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vmovdqu16 %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <8 x i16> %trigger, zeroinitializer
call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %val, <8 x i16>* %addr, i32 4, <8 x i1> %mask)
ret void
@@ -2510,6 +2626,14 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <16 x i
; AVX512VLBW-NEXT: vmovdqu16 %ymm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v16i16_v16i16:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmw %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT: vmovdqu16 %ymm1, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%mask = icmp eq <16 x i16> %trigger, zeroinitializer
call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1> %mask)
ret void
@@ -3104,6 +3228,13 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i8> %
; AVX512VLBW-NEXT: vptestnmb %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v16i8_v16i8:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmb %xmm0, %xmm0, %k1
+; X86-AVX512-NEXT: vmovdqu8 %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask = icmp eq <16 x i8> %trigger, zeroinitializer
call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %val, <16 x i8>* %addr, i32 4, <16 x i1> %mask)
ret void
@@ -4491,6 +4622,14 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i8> %
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: store_v32i8_v32i8:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestnmb %ymm0, %ymm0, %k1
+; X86-AVX512-NEXT: vmovdqu8 %ymm1, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%mask = icmp eq <32 x i8> %trigger, zeroinitializer
call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1> %mask)
ret void
@@ -4508,6 +4647,12 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr,
; AVX: ## %bb.0:
; AVX-NEXT: vmovups %xmm1, (%rdi)
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore_constmask_v4i32_v4i32:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovups %xmm1, (%eax)
+; X86-AVX512-NEXT: retl
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
ret void
@@ -4598,6 +4743,15 @@ define void @mstore_constmask_allones_split(<16 x i32> %trigger, <16 x i32>* %ad
; AVX512VLBW-NEXT: vmovdqu32 %zmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: mstore_constmask_allones_split:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movw $-37, %cx
+; X86-AVX512-NEXT: kmovd %ecx, %k1
+; X86-AVX512-NEXT: vmovdqu32 %zmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%mask = icmp eq <16 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %val, <16 x i32>* %addr, i32 4, <16 x i1><i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
ret void
@@ -4615,6 +4769,12 @@ define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
; AVX: ## %bb.0:
; AVX-NEXT: vmovss %xmm0, (%rdi)
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: one_mask_bit_set1:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovss %xmm0, (%eax)
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
ret void
}
@@ -4637,6 +4797,12 @@ define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
; AVX: ## %bb.0:
; AVX-NEXT: vextractps $2, %xmm0, 8(%rdi)
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: one_mask_bit_set2:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextractps $2, %xmm0, 8(%eax)
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
@@ -4655,6 +4821,14 @@ define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
; AVX-NEXT: vmovlps %xmm0, 16(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: one_mask_bit_set3:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX512-NEXT: vmovlps %xmm0, 16(%eax)
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
ret void
}
@@ -4673,6 +4847,14 @@ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
; AVX-NEXT: vmovhps %xmm0, 24(%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: one_mask_bit_set4:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX512-NEXT: vmovhps %xmm0, 24(%eax)
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
ret void
}
@@ -4698,6 +4880,14 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
; AVX512-NEXT: vmovlps %xmm0, 48(%rdi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
+;
+; X86-AVX512-LABEL: one_mask_bit_set5:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
+; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax)
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
ret void
}
@@ -4783,6 +4973,15 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub
; AVX512VLBW-NEXT: vmovupd %ymm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: masked_store_bool_mask_demand_trunc_sext:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpslld $31, %xmm1, %xmm1
+; X86-AVX512-NEXT: vpmovd2m %xmm1, %k1
+; X86-AVX512-NEXT: vmovupd %ymm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
%sext = sext <4 x i32> %masksrc to <4 x i64>
%boolmask = trunc <4 x i64> %sext to <4 x i1>
call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 4, <4 x i1> %boolmask)
@@ -4882,6 +5081,13 @@ define void @one_mask_bit_set1_variable(<4 x float>* %addr, <4 x float> %val, <4
; AVX512VL-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VL-NEXT: retq
+;
+; X86-AVX512-LABEL: one_mask_bit_set1_variable:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vptestmd LCPI25_0{1to4}, %xmm1, %k1
+; X86-AVX512-NEXT: vmovups %xmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%mask_signbit = and <4 x i32> %mask, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
%mask_bool = icmp ne <4 x i32> %mask_signbit, zeroinitializer
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 1, <4 x i1> %mask_bool)
@@ -5045,6 +5251,29 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; AVX512VLBW-NEXT: korw %k1, %k0, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: widen_masked_store:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movb $-3, %al
+; X86-AVX512-NEXT: kmovd %eax, %k0
+; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1
+; X86-AVX512-NEXT: kshiftrb $7, %k1, %k1
+; X86-AVX512-NEXT: kandw %k0, %k1, %k0
+; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1
+; X86-AVX512-NEXT: kshiftrb $6, %k1, %k1
+; X86-AVX512-NEXT: korw %k1, %k0, %k0
+; X86-AVX512-NEXT: movb $-5, %al
+; X86-AVX512-NEXT: kmovd %eax, %k1
+; X86-AVX512-NEXT: kandw %k1, %k0, %k0
+; X86-AVX512-NEXT: kmovb {{[0-9]+}}(%esp), %k1
+; X86-AVX512-NEXT: kshiftlb $7, %k1, %k1
+; X86-AVX512-NEXT: kshiftrb $5, %k1, %k1
+; X86-AVX512-NEXT: korw %k1, %k0, %k1
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vmovdqa32 %xmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)
ret void
}
@@ -5057,6 +5286,10 @@ define void @zero_mask(<2 x double>* %addr, <2 x double> %val) {
; AVX-LABEL: zero_mask:
; AVX: ## %bb.0:
; AVX-NEXT: retq
+;
+; X86-AVX512-LABEL: zero_mask:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %addr, i32 4, <2 x i1> zeroinitializer)
ret void
}
@@ -5226,6 +5459,14 @@ define void @PR11210(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <2 x i64
; AVX512VLBW-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
+;
+; X86-AVX512-LABEL: PR11210:
+; X86-AVX512: ## %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: vpmovd2m %xmm2, %k1
+; X86-AVX512-NEXT: vmovups %xmm0, (%eax) {%k1}
+; X86-AVX512-NEXT: vmovups %xmm1, (%eax) {%k1}
+; X86-AVX512-NEXT: retl
%bc = bitcast <2 x i64> %mask to <4 x i32>
%trunc = icmp slt <4 x i32> %bc, zeroinitializer
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %x, <4 x float>* %ptr, i32 1, <4 x i1> %trunc)
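The RUN lines in these tests are shell commands executed by lit, and the per-prefix CHECK blocks are typically autogenerated rather than written by hand. A sketch of how such check lines would be regenerated after adding a RUN line, assuming a standard llvm-project checkout:

  python llvm/utils/update_llc_test_checks.py llvm/test/CodeGen/X86/masked_load.ll llvm/test/CodeGen/X86/masked_store.ll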