[llvm] 17857d9 - [X86] Generate `kmov` for masking integers (#120593)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 3 07:05:13 PST 2025
Author: Abhishek Kaushik
Date: 2025-03-03T20:35:09+05:30
New Revision: 17857d92416da5997262318a6f62fccad9c5a156
URL: https://github.com/llvm/llvm-project/commit/17857d92416da5997262318a6f62fccad9c5a156
DIFF: https://github.com/llvm/llvm-project/commit/17857d92416da5997262318a6f62fccad9c5a156.diff
LOG: [X86] Generate `kmov` for masking integers (#120593)
LOG: [X86] Generate `kmov` for masking integers (#120593)
When we have an integer used as a bit mask, the LLVM IR looks something
like this
```
%1 = and <16 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768>
%cmp1 = icmp ne <16 x i32> %1, zeroinitializer
```
where `.splat` is a vector containing the mask in all lanes. The assembly
generated for this looks like
```
vpbroadcastd %ecx, %zmm0
vptestmd .LCPI0_0(%rip), %zmm0, %k1
```
where we have a constant table of powers of 2.
Instead of doing this we could just move the relevant bits directly to
`k` registers using a `kmov` instruction.
```
kmovw %ecx, %k1
```
This is faster and also reduces code size.
Added:
llvm/test/CodeGen/X86/kmov.ll
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/pr78897.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3e23250d861c5..fe170fe3ade08 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55751,6 +55751,88 @@ static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
return SDValue();
}
+// The pattern (setcc (and (broadcast x), (2^n, 2^{n+1}, ...)), (0, 0, ...),
+// eq/ne) is generated when using an integer as a mask. Instead of generating a
+// broadcast + vptest, we can directly move the integer to a mask register.
+static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (CC != ISD::SETNE && CC != ISD::SETEQ)
+ return SDValue();
+
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+
+ if (Op0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue Broadcast = Op0.getOperand(0);
+ if (Broadcast.getOpcode() != X86ISD::VBROADCAST &&
+ Broadcast.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return SDValue();
+
+ SDValue Load = Op0.getOperand(1);
+ EVT LoadVT = Load.getSimpleValueType();
+
+ APInt UndefElts;
+ SmallVector<APInt, 32> EltBits;
+ if (!getTargetConstantBitsFromNode(Load, LoadVT.getScalarSizeInBits(),
+ UndefElts, EltBits,
+ /*AllowWholeUndefs*/ true,
+ /*AllowPartialUndefs*/ false) ||
+ UndefElts[0] || !EltBits[0].isPowerOf2() || UndefElts.getBitWidth() > 16)
+ return SDValue();
+
+ // Check if the constant pool contains only powers of 2 starting from some
+ // 2^N. The table may also contain undefs because of widening of vector
+ // operands.
+ unsigned N = EltBits[0].logBase2();
+ unsigned Len = UndefElts.getBitWidth();
+ for (unsigned I = 1; I != Len; ++I) {
+ if (UndefElts[I]) {
+ if (!UndefElts.extractBits(Len - (I + 1), I + 1).isAllOnes())
+ return SDValue();
+ break;
+ }
+
+ if (EltBits[I].getBitWidth() <= N + I || !EltBits[I].isOneBitSet(N + I))
+ return SDValue();
+ }
+
+ MVT BroadcastOpVT = Broadcast.getSimpleValueType().getVectorElementType();
+ SDValue BroadcastOp;
+ if (Broadcast.getOpcode() != X86ISD::VBROADCAST) {
+ BroadcastOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, BroadcastOpVT,
+ Broadcast, DAG.getVectorIdxConstant(0, DL));
+ } else {
+ BroadcastOp = Broadcast.getOperand(0);
+ if (BroadcastOp.getValueType().isVector())
+ return SDValue();
+ }
+
+ SDValue Masked = BroadcastOp;
+ if (N != 0) {
+ APInt Mask = APInt::getLowBitsSet(BroadcastOpVT.getSizeInBits(), Len);
+ SDValue ShiftedValue = DAG.getNode(ISD::SRL, DL, BroadcastOpVT, BroadcastOp,
+ DAG.getConstant(N, DL, BroadcastOpVT));
+ Masked = DAG.getNode(ISD::AND, DL, BroadcastOpVT, ShiftedValue,
+ DAG.getConstant(Mask, DL, BroadcastOpVT));
+ }
+ // We can't extract more than 16 bits using this pattern, because 2^{17} will
+ // not fit in an i16 and a vXi32 where X > 16 is more than 512 bits.
+ SDValue Trunc = DAG.getAnyExtOrTrunc(Masked, DL, MVT::i16);
+ SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, MVT::v16i1, Trunc);
+
+ if (CC == ISD::SETEQ)
+ Bitcast = DAG.getNOT(DL, Bitcast, MVT::v16i1);
+
+ if (VT != MVT::v16i1)
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Bitcast,
+ DAG.getVectorIdxConstant(0, DL));
+
+ return Bitcast;
+}
+
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -55883,6 +55965,11 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
"Unexpected condition code!");
return Op0.getOperand(0);
}
+
+ if (IsVZero1)
+ if (SDValue V =
+ combineAVX512SetCCToKMOV(VT, Op0, TmpCC, DL, DAG, Subtarget))
+ return V;
}
// Try and make unsigned vector comparison signed. On pre AVX512 targets there
diff --git a/llvm/test/CodeGen/X86/kmov.ll b/llvm/test/CodeGen/X86/kmov.ll
new file mode 100644
index 0000000000000..55fb2527722a4
--- /dev/null
+++ b/llvm/test/CodeGen/X86/kmov.ll
@@ -0,0 +1,643 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64-AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s --check-prefixes=X64-KNL
+
+define <2 x i1> @i8_mask_extract2(i8 %mask) {
+; X64-AVX512-LABEL: i8_mask_extract2:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: vpmovm2q %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i8_mask_extract2:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [1,2,1,2,1,2,1,2,1,2,1,2,1,2,1,2]
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <2 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <2 x i8> %.splatinsert, <2 x i8> poison, <2 x i32> zeroinitializer
+ %1 = and <2 x i8> %.splat, <i8 1, i8 2>
+ %cmp.45 = icmp ne <2 x i8> %1, zeroinitializer
+ ret <2 x i1> %cmp.45
+}
+
+define <2 x i1> @invert_i8_mask_extract2(i8 %mask) {
+; X64-AVX512-LABEL: invert_i8_mask_extract2:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotw %k0, %k0
+; X64-AVX512-NEXT: vpmovm2q %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i8_mask_extract2:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <2 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <2 x i8> %.splatinsert, <2 x i8> poison, <2 x i32> zeroinitializer
+ %1 = and <2 x i8> %.splat, <i8 1, i8 2>
+ %cmp.45 = icmp eq <2 x i8> %1, zeroinitializer
+ ret <2 x i1> %cmp.45
+}
+
+define <4 x i1> @i8_mask_extract_4(i8 %mask) {
+; X64-AVX512-LABEL: i8_mask_extract_4:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: vpmovm2d %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i8_mask_extract_4:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,2,4,8,1,2,4,8,1,2,4,8,1,2,4,8]
+; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <4 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <4 x i8> %.splatinsert, <4 x i8> poison, <4 x i32> zeroinitializer
+ %1 = and <4 x i8> %.splat, <i8 1, i8 2, i8 4, i8 8>
+ %cmp.45 = icmp ne <4 x i8> %1, zeroinitializer
+ ret <4 x i1> %cmp.45
+}
+
+define <4 x i1> @invert_i8_mask_extract_4(i8 %mask) {
+; X64-AVX512-LABEL: invert_i8_mask_extract_4:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotw %k0, %k0
+; X64-AVX512-NEXT: vpmovm2d %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i8_mask_extract_4:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <4 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <4 x i8> %.splatinsert, <4 x i8> poison, <4 x i32> zeroinitializer
+ %1 = and <4 x i8> %.splat, <i8 1, i8 2, i8 4, i8 8>
+ %cmp.45 = icmp eq <4 x i8> %1, zeroinitializer
+ ret <4 x i1> %cmp.45
+}
+
+define <8 x i1> @i8_mask_extract_8(i8 %mask) {
+; X64-AVX512-LABEL: i8_mask_extract_8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: vpmovm2w %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i8_mask_extract_8:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
+; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <8 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <8 x i8> %.splatinsert, <8 x i8> poison, <8 x i32> zeroinitializer
+ %1 = and <8 x i8> %.splat, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
+ %cmp.45 = icmp ne <8 x i8> %1, zeroinitializer
+ ret <8 x i1> %cmp.45
+}
+
+define <8 x i1> @invert_i8_mask_extract_8(i8 %mask) {
+; X64-AVX512-LABEL: invert_i8_mask_extract_8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotb %k0, %k0
+; X64-AVX512-NEXT: vpmovm2w %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i8_mask_extract_8:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastb %xmm0, %xmm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <8 x i8> poison, i8 %mask, i64 0
+ %.splat = shufflevector <8 x i8> %.splatinsert, <8 x i8> poison, <8 x i32> zeroinitializer
+ %1 = and <8 x i8> %.splat, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 128>
+ %cmp.45 = icmp eq <8 x i8> %1, zeroinitializer
+ ret <8 x i1> %cmp.45
+}
+
+define <4 x i1> @i16_mask_extract_4(i16 %mask) {
+; X64-AVX512-LABEL: i16_mask_extract_4:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: vpmovm2d %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i16_mask_extract_4:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,2,4,8,1,2,4,8]
+; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <4 x i16> poison, i16 %mask, i64 0
+ %.splat = shufflevector <4 x i16> %.splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
+ %1 = and <4 x i16> %.splat, <i16 1, i16 2, i16 4, i16 8>
+ %cmp.45 = icmp ne <4 x i16> %1, zeroinitializer
+ ret <4 x i1> %cmp.45
+}
+
+define <4 x i1> @invert_i16_mask_extract_4(i16 %mask) {
+; X64-AVX512-LABEL: invert_i16_mask_extract_4:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotw %k0, %k0
+; X64-AVX512-NEXT: vpmovm2d %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i16_mask_extract_4:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <4 x i16> poison, i16 %mask, i64 0
+ %.splat = shufflevector <4 x i16> %.splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
+ %1 = and <4 x i16> %.splat, <i16 1, i16 2, i16 4, i16 8>
+ %cmp.45 = icmp eq <4 x i16> %1, zeroinitializer
+ ret <4 x i1> %cmp.45
+}
+
+define <8 x i1> @i16_mask_extract_8(i16 %mask) {
+; X64-AVX512-LABEL: i16_mask_extract_8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vpbroadcastw %edi, %xmm0
+; X64-AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; X64-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i16_mask_extract_8:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-KNL-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
+; X64-KNL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <8 x i16> poison, i16 %mask, i64 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
+ %1 = and <8 x i16> %.splat, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
+ %cmp.45 = icmp ne <8 x i16> %1, zeroinitializer
+ ret <8 x i1> %cmp.45
+}
+
+define <8 x i1> @invert_i16_mask_extract_8(i16 %mask) {
+; X64-AVX512-LABEL: invert_i16_mask_extract_8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vpbroadcastw %edi, %xmm0
+; X64-AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i16_mask_extract_8:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw %xmm0, %xmm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <8 x i16> poison, i16 %mask, i64 0
+ %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
+ %1 = and <8 x i16> %.splat, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128>
+ %cmp.45 = icmp eq <8 x i16> %1, zeroinitializer
+ ret <8 x i1> %cmp.45
+}
+
+define <16 x i1> @i16_mask_extract_16(i16 %mask) {
+; X64-AVX512-LABEL: i16_mask_extract_16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i16_mask_extract_16:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-KNL-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
+; X64-KNL-NEXT: vpand %ymm1, %ymm0, %ymm0
+; X64-KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; X64-KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <16 x i16> poison, i16 %mask, i64 0
+ %.splat = shufflevector <16 x i16> %.splatinsert, <16 x i16> poison, <16 x i32> zeroinitializer
+ %1 = and <16 x i16> %.splat, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 8192, i16 16384, i16 32768>
+ %cmp.45 = icmp ne <16 x i16> %1, zeroinitializer
+ ret <16 x i1> %cmp.45
+}
+
+define <16 x i1> @invert_i16_mask_extract_16(i16 %mask) {
+; X64-AVX512-LABEL: invert_i16_mask_extract_16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotw %k0, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i16_mask_extract_16:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: vmovd %edi, %xmm0
+; X64-KNL-NEXT: vpbroadcastw %xmm0, %ymm0
+; X64-KNL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-KNL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
+; X64-KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <16 x i16> poison, i16 %mask, i64 0
+ %.splat = shufflevector <16 x i16> %.splatinsert, <16 x i16> poison, <16 x i32> zeroinitializer
+ %1 = and <16 x i16> %.splat, <i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 128, i16 256, i16 512, i16 1024, i16 2048, i16 4096, i16 8192, i16 16384, i16 32768>
+ %cmp.45 = icmp eq <16 x i16> %1, zeroinitializer
+ ret <16 x i1> %cmp.45
+}
+
+define <16 x i1> @i32_mask_extract_16(i32 %mask) {
+; X64-AVX512-LABEL: i32_mask_extract_16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i32_mask_extract_16:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: kmovw %edi, %k1
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <16 x i32> poison, i32 %mask, i64 0
+ %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
+ %1 = and <16 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768>
+ %cmp.45 = icmp ne <16 x i32> %1, zeroinitializer
+ ret <16 x i1> %cmp.45
+}
+
+define <16 x i1> @invert_i32_mask_extract_16(i32 %mask) {
+; X64-AVX512-LABEL: invert_i32_mask_extract_16:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotw %k0, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %xmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i32_mask_extract_16:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: kmovw %edi, %k0
+; X64-KNL-NEXT: knotw %k0, %k1
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <16 x i32> poison, i32 %mask, i64 0
+ %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer
+ %1 = and <16 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768>
+ %cmp.45 = icmp eq <16 x i32> %1, zeroinitializer
+ ret <16 x i1> %cmp.45
+}
+
+define <32 x i1> @i32_mask_extract_32(i32 %mask) {
+; X64-AVX512-LABEL: i32_mask_extract_32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: kshiftrd $16, %k0, %k1
+; X64-AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i32_mask_extract_32:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: kmovw %edi, %k1
+; X64-KNL-NEXT: shrl $16, %edi
+; X64-KNL-NEXT: kmovw %edi, %k2
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm1, %xmm1
+; X64-KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <32 x i32> poison, i32 %mask, i64 0
+ %.splat = shufflevector <32 x i32> %.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer
+ %1 = and <32 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768, i32 65536, i32 131072, i32 262144, i32 524288, i32 1048576, i32 2097152, i32 4194304, i32 8388608, i32 16777216, i32 33554432, i32 67108864, i32 134217728, i32 268435456, i32 536870912, i32 1073741824, i32 2147483648>
+ %cmp.45 = icmp ne <32 x i32> %1, zeroinitializer
+ ret <32 x i1> %cmp.45
+}
+
+define <32 x i1> @invert_i32_mask_extract_32(i32 %mask) {
+; X64-AVX512-LABEL: invert_i32_mask_extract_32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: kshiftrd $16, %k0, %k1
+; X64-AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i32_mask_extract_32:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: kmovw %edi, %k1
+; X64-KNL-NEXT: shrl $16, %edi
+; X64-KNL-NEXT: kmovw %edi, %k2
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm1, %xmm1
+; X64-KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <32 x i32> poison, i32 %mask, i64 0
+ %.splat = shufflevector <32 x i32> %.splatinsert, <32 x i32> poison, <32 x i32> zeroinitializer
+ %1 = and <32 x i32> %.splat, <i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64, i32 128, i32 256, i32 512, i32 1024, i32 2048, i32 4096, i32 8192, i32 16384, i32 32768, i32 65536, i32 131072, i32 262144, i32 524288, i32 1048576, i32 2097152, i32 4194304, i32 8388608, i32 16777216, i32 33554432, i32 67108864, i32 134217728, i32 268435456, i32 536870912, i32 1073741824, i32 2147483648>
+ %cmp.45 = icmp ne <32 x i32> %1, zeroinitializer
+ ret <32 x i1> %cmp.45
+}
+
+define <32 x i1> @i64_mask_extract_32(i64 %mask) {
+; X64-AVX512-LABEL: i64_mask_extract_32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: movq %rdi, %rax
+; X64-AVX512-NEXT: kmovd %eax, %k0
+; X64-AVX512-NEXT: movzbl %ah, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k1
+; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k0
+; X64-AVX512-NEXT: movl %eax, %ecx
+; X64-AVX512-NEXT: shrl $24, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k1
+; X64-AVX512-NEXT: shrl $16, %eax
+; X64-AVX512-NEXT: movzbl %al, %eax
+; X64-AVX512-NEXT: kmovd %eax, %k2
+; X64-AVX512-NEXT: kunpckbw %k2, %k1, %k1
+; X64-AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i64_mask_extract_32:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: movq %rdi, %rax
+; X64-KNL-NEXT: movl %eax, %ecx
+; X64-KNL-NEXT: kmovw %eax, %k0
+; X64-KNL-NEXT: movzbl %ah, %edx
+; X64-KNL-NEXT: # kill: def $eax killed $eax killed $rax
+; X64-KNL-NEXT: shrl $24, %eax
+; X64-KNL-NEXT: kmovw %eax, %k1
+; X64-KNL-NEXT: shrl $16, %ecx
+; X64-KNL-NEXT: movzbl %cl, %eax
+; X64-KNL-NEXT: kmovw %eax, %k2
+; X64-KNL-NEXT: kunpckbw %k2, %k1, %k1
+; X64-KNL-NEXT: kmovw %edx, %k2
+; X64-KNL-NEXT: kunpckbw %k0, %k2, %k2
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm1, %xmm1
+; X64-KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <32 x i64> poison, i64 %mask, i64 0
+ %.splat = shufflevector <32 x i64> %.splatinsert, <32 x i64> poison, <32 x i32> zeroinitializer
+ %1 = and <32 x i64> %.splat, <i64 1, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768, i64 65536, i64 131072, i64 262144, i64 524288, i64 1048576, i64 2097152, i64 4194304, i64 8388608, i64 16777216, i64 33554432, i64 67108864, i64 134217728, i64 268435456, i64 536870912, i64 1073741824, i64 2147483648>
+ %cmp.45 = icmp ne <32 x i64> %1, zeroinitializer
+ ret <32 x i1> %cmp.45
+}
+
+define <32 x i1> @invert_i64_mask_extract_32(i64 %mask) {
+; X64-AVX512-LABEL: invert_i64_mask_extract_32:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovq %rdi, %k0
+; X64-AVX512-NEXT: knotb %k0, %k1
+; X64-AVX512-NEXT: kshiftrd $8, %k0, %k2
+; X64-AVX512-NEXT: knotb %k2, %k2
+; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1
+; X64-AVX512-NEXT: kshiftrd $16, %k0, %k2
+; X64-AVX512-NEXT: knotb %k2, %k2
+; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0
+; X64-AVX512-NEXT: knotb %k0, %k0
+; X64-AVX512-NEXT: kunpckbw %k2, %k0, %k0
+; X64-AVX512-NEXT: kunpckwd %k1, %k0, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %ymm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i64_mask_extract_32:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: movl %edi, %eax
+; X64-KNL-NEXT: shrl $16, %eax
+; X64-KNL-NEXT: kmovw %eax, %k0
+; X64-KNL-NEXT: knotw %k0, %k0
+; X64-KNL-NEXT: movl %edi, %eax
+; X64-KNL-NEXT: shrl $24, %eax
+; X64-KNL-NEXT: kmovw %eax, %k1
+; X64-KNL-NEXT: knotw %k1, %k1
+; X64-KNL-NEXT: kunpckbw %k0, %k1, %k1
+; X64-KNL-NEXT: kmovw %edi, %k0
+; X64-KNL-NEXT: knotw %k0, %k0
+; X64-KNL-NEXT: shrl $8, %edi
+; X64-KNL-NEXT: kmovw %edi, %k2
+; X64-KNL-NEXT: knotw %k2, %k2
+; X64-KNL-NEXT: kunpckbw %k0, %k2, %k2
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm0, %xmm0
+; X64-KNL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
+; X64-KNL-NEXT: vpmovdb %zmm1, %xmm1
+; X64-KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <32 x i64> poison, i64 %mask, i64 0
+ %.splat = shufflevector <32 x i64> %.splatinsert, <32 x i64> poison, <32 x i32> zeroinitializer
+ %1 = and <32 x i64> %.splat, <i64 1, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768, i64 65536, i64 131072, i64 262144, i64 524288, i64 1048576, i64 2097152, i64 4194304, i64 8388608, i64 16777216, i64 33554432, i64 67108864, i64 134217728, i64 268435456, i64 536870912, i64 1073741824, i64 2147483648>
+ %cmp.45 = icmp eq <32 x i64> %1, zeroinitializer
+ ret <32 x i1> %cmp.45
+}
+
+define <64 x i1> @i64_mask_extract_64(i64 %mask) {
+; X64-AVX512-LABEL: i64_mask_extract_64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: movq %rdi, %rax
+; X64-AVX512-NEXT: kmovd %eax, %k0
+; X64-AVX512-NEXT: movzbl %ah, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k1
+; X64-AVX512-NEXT: kunpckbw %k0, %k1, %k0
+; X64-AVX512-NEXT: movl %eax, %ecx
+; X64-AVX512-NEXT: shrl $24, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k1
+; X64-AVX512-NEXT: movl %eax, %ecx
+; X64-AVX512-NEXT: shrl $16, %ecx
+; X64-AVX512-NEXT: movzbl %cl, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k2
+; X64-AVX512-NEXT: kunpckbw %k2, %k1, %k1
+; X64-AVX512-NEXT: kunpckwd %k0, %k1, %k0
+; X64-AVX512-NEXT: movq %rdi, %rcx
+; X64-AVX512-NEXT: shrq $32, %rcx
+; X64-AVX512-NEXT: movzbl %cl, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k1
+; X64-AVX512-NEXT: movq %rdi, %rcx
+; X64-AVX512-NEXT: shrq $40, %rcx
+; X64-AVX512-NEXT: movzbl %cl, %ecx
+; X64-AVX512-NEXT: kmovd %ecx, %k2
+; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1
+; X64-AVX512-NEXT: movq %rdi, %rcx
+; X64-AVX512-NEXT: shrq $56, %rcx
+; X64-AVX512-NEXT: kmovd %ecx, %k2
+; X64-AVX512-NEXT: shrq $48, %rax
+; X64-AVX512-NEXT: movzbl %al, %eax
+; X64-AVX512-NEXT: kmovd %eax, %k3
+; X64-AVX512-NEXT: kunpckbw %k3, %k2, %k2
+; X64-AVX512-NEXT: kunpckwd %k1, %k2, %k1
+; X64-AVX512-NEXT: kunpckdq %k0, %k1, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %zmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: i64_mask_extract_64:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: pushq %rbx
+; X64-KNL-NEXT: .cfi_def_cfa_offset 16
+; X64-KNL-NEXT: .cfi_offset %rbx, -16
+; X64-KNL-NEXT: movq %rsi, %rcx
+; X64-KNL-NEXT: movq %rdi, %rax
+; X64-KNL-NEXT: movl %ecx, %edx
+; X64-KNL-NEXT: movq %rsi, %rdi
+; X64-KNL-NEXT: movq %rsi, %r8
+; X64-KNL-NEXT: movq %rsi, %r9
+; X64-KNL-NEXT: kmovw %ecx, %k0
+; X64-KNL-NEXT: movzbl %ch, %ebx
+; X64-KNL-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; X64-KNL-NEXT: shrl $24, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k1
+; X64-KNL-NEXT: shrl $16, %edx
+; X64-KNL-NEXT: movzbl %dl, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k2
+; X64-KNL-NEXT: shrq $32, %rsi
+; X64-KNL-NEXT: movzbl %sil, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k3
+; X64-KNL-NEXT: shrq $40, %rdi
+; X64-KNL-NEXT: movzbl %dil, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k4
+; X64-KNL-NEXT: kunpckbw %k2, %k1, %k1
+; X64-KNL-NEXT: shrq $56, %r8
+; X64-KNL-NEXT: kmovw %r8d, %k2
+; X64-KNL-NEXT: kunpckbw %k3, %k4, %k3
+; X64-KNL-NEXT: shrq $48, %r9
+; X64-KNL-NEXT: movzbl %r9b, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k4
+; X64-KNL-NEXT: kunpckbw %k4, %k2, %k2
+; X64-KNL-NEXT: kmovw %ebx, %k4
+; X64-KNL-NEXT: kunpckbw %k0, %k4, %k0
+; X64-KNL-NEXT: kmovw %k0, (%rax)
+; X64-KNL-NEXT: kmovw %k2, 6(%rax)
+; X64-KNL-NEXT: kmovw %k3, 4(%rax)
+; X64-KNL-NEXT: kmovw %k1, 2(%rax)
+; X64-KNL-NEXT: popq %rbx
+; X64-KNL-NEXT: .cfi_def_cfa_offset 8
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <64 x i64> poison, i64 %mask, i64 0
+ %.splat = shufflevector <64 x i64> %.splatinsert, <64 x i64> poison, <64 x i32> zeroinitializer
+ %1 = and <64 x i64> %.splat, <i64 1, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768, i64 65536, i64 131072, i64 262144, i64 524288, i64 1048576, i64 2097152, i64 4194304, i64 8388608, i64 16777216, i64 33554432, i64 67108864, i64 134217728, i64 268435456, i64 536870912, i64 1073741824, i64 2147483648, i64 4294967296, i64 8589934592, i64 17179869184, i64 34359738368, i64 68719476736, i64 137438953472, i64 274877906944, i64 549755813888, i64 1099511627776, i64 2199023255552, i64 4398046511104, i64 8796093022208, i64 17592186044416, i64 35184372088832, i64 70368744177664, i64 140737488355328, i64 281474976710656, i64 562949953421312, i64 1125899906842624, i64 2251799813685248, i64 4503599627370496, i64 9007199254740992, i64 18014398509481984, i64 36028797018963968, i64 72057594037927936, i64 144115188075855872, i64 288230376151711744, i64 576460752303423488, i64 1152921504606846976, i64 2305843009213693952, i64 4611686018427387904, i64 9223372036854775808>
+ %cmp.45 = icmp ne <64 x i64> %1, zeroinitializer
+ ret <64 x i1> %cmp.45
+}
+
+define <64 x i1> @invert_i64_mask_extract_64(i64 %mask) {
+; X64-AVX512-LABEL: invert_i64_mask_extract_64:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: kmovq %rdi, %k0
+; X64-AVX512-NEXT: kshiftrq $32, %k0, %k1
+; X64-AVX512-NEXT: knotb %k1, %k1
+; X64-AVX512-NEXT: kshiftrq $40, %k0, %k2
+; X64-AVX512-NEXT: knotb %k2, %k2
+; X64-AVX512-NEXT: kunpckbw %k1, %k2, %k1
+; X64-AVX512-NEXT: kshiftrq $48, %k0, %k2
+; X64-AVX512-NEXT: knotb %k2, %k2
+; X64-AVX512-NEXT: kshiftrq $56, %k0, %k3
+; X64-AVX512-NEXT: knotb %k3, %k3
+; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2
+; X64-AVX512-NEXT: kunpckwd %k1, %k2, %k1
+; X64-AVX512-NEXT: knotb %k0, %k2
+; X64-AVX512-NEXT: kshiftrd $8, %k0, %k3
+; X64-AVX512-NEXT: knotb %k3, %k3
+; X64-AVX512-NEXT: kunpckbw %k2, %k3, %k2
+; X64-AVX512-NEXT: kshiftrd $16, %k0, %k3
+; X64-AVX512-NEXT: knotb %k3, %k3
+; X64-AVX512-NEXT: kshiftrd $24, %k0, %k0
+; X64-AVX512-NEXT: knotb %k0, %k0
+; X64-AVX512-NEXT: kunpckbw %k3, %k0, %k0
+; X64-AVX512-NEXT: kunpckwd %k2, %k0, %k0
+; X64-AVX512-NEXT: kunpckdq %k0, %k1, %k0
+; X64-AVX512-NEXT: vpmovm2b %k0, %zmm0
+; X64-AVX512-NEXT: retq
+;
+; X64-KNL-LABEL: invert_i64_mask_extract_64:
+; X64-KNL: # %bb.0:
+; X64-KNL-NEXT: movq %rdi, %rax
+; X64-KNL-NEXT: kmovw %esi, %k0
+; X64-KNL-NEXT: knotw %k0, %k0
+; X64-KNL-NEXT: movl %esi, %ecx
+; X64-KNL-NEXT: shrl $8, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k1
+; X64-KNL-NEXT: knotw %k1, %k1
+; X64-KNL-NEXT: kunpckbw %k0, %k1, %k0
+; X64-KNL-NEXT: movl %esi, %ecx
+; X64-KNL-NEXT: shrl $16, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k1
+; X64-KNL-NEXT: knotw %k1, %k1
+; X64-KNL-NEXT: movl %esi, %ecx
+; X64-KNL-NEXT: shrl $24, %ecx
+; X64-KNL-NEXT: kmovw %ecx, %k2
+; X64-KNL-NEXT: knotw %k2, %k2
+; X64-KNL-NEXT: kunpckbw %k1, %k2, %k1
+; X64-KNL-NEXT: movq %rsi, %rcx
+; X64-KNL-NEXT: shrq $32, %rcx
+; X64-KNL-NEXT: kmovw %ecx, %k2
+; X64-KNL-NEXT: knotw %k2, %k2
+; X64-KNL-NEXT: movq %rsi, %rcx
+; X64-KNL-NEXT: shrq $40, %rcx
+; X64-KNL-NEXT: kmovw %ecx, %k3
+; X64-KNL-NEXT: knotw %k3, %k3
+; X64-KNL-NEXT: kunpckbw %k2, %k3, %k2
+; X64-KNL-NEXT: movq %rsi, %rcx
+; X64-KNL-NEXT: shrq $48, %rcx
+; X64-KNL-NEXT: kmovw %ecx, %k3
+; X64-KNL-NEXT: knotw %k3, %k3
+; X64-KNL-NEXT: shrq $56, %rsi
+; X64-KNL-NEXT: kmovw %esi, %k4
+; X64-KNL-NEXT: knotw %k4, %k4
+; X64-KNL-NEXT: kunpckbw %k3, %k4, %k3
+; X64-KNL-NEXT: kmovw %k3, 6(%rdi)
+; X64-KNL-NEXT: kmovw %k2, 4(%rdi)
+; X64-KNL-NEXT: kmovw %k1, 2(%rdi)
+; X64-KNL-NEXT: kmovw %k0, (%rdi)
+; X64-KNL-NEXT: retq
+ %.splatinsert = insertelement <64 x i64> poison, i64 %mask, i64 0
+ %.splat = shufflevector <64 x i64> %.splatinsert, <64 x i64> poison, <64 x i32> zeroinitializer
+ %1 = and <64 x i64> %.splat, <i64 1, i64 2, i64 4, i64 8, i64 16, i64 32, i64 64, i64 128, i64 256, i64 512, i64 1024, i64 2048, i64 4096, i64 8192, i64 16384, i64 32768, i64 65536, i64 131072, i64 262144, i64 524288, i64 1048576, i64 2097152, i64 4194304, i64 8388608, i64 16777216, i64 33554432, i64 67108864, i64 134217728, i64 268435456, i64 536870912, i64 1073741824, i64 2147483648, i64 4294967296, i64 8589934592, i64 17179869184, i64 34359738368, i64 68719476736, i64 137438953472, i64 274877906944, i64 549755813888, i64 1099511627776, i64 2199023255552, i64 4398046511104, i64 8796093022208, i64 17592186044416, i64 35184372088832, i64 70368744177664, i64 140737488355328, i64 281474976710656, i64 562949953421312, i64 1125899906842624, i64 2251799813685248, i64 4503599627370496, i64 9007199254740992, i64 18014398509481984, i64 36028797018963968, i64 72057594037927936, i64 144115188075855872, i64 288230376151711744, i64 576460752303423488, i64 1152921504606846976, i64 2305843009213693952, i64 4611686018427387904, i64 9223372036854775808>
+ %cmp.45 = icmp eq <64 x i64> %1, zeroinitializer
+ ret <64 x i1> %cmp.45
+}
+
diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll
index 56e4ec2bc8ecb..0caa569107c0c 100644
--- a/llvm/test/CodeGen/X86/pr78897.ll
+++ b/llvm/test/CodeGen/X86/pr78897.ll
@@ -224,7 +224,9 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
; X86-AVX512-NEXT: pushl %edi
; X86-AVX512-NEXT: pushl %esi
; X86-AVX512-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0
-; X86-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %k1
+; X86-AVX512-NEXT: vmovd %xmm0, %eax
+; X86-AVX512-NEXT: kmovd %eax, %k0
+; X86-AVX512-NEXT: knotw %k0, %k1
; X86-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
; X86-AVX512-NEXT: vpextrd $1, %xmm0, %eax
; X86-AVX512-NEXT: vmovd %xmm0, %edx
@@ -256,8 +258,8 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
;
; X64-AVX512-LABEL: produceShuffleVectorForByte:
; X64-AVX512: # %bb.0: # %entry
-; X64-AVX512-NEXT: vpbroadcastb %edi, %xmm0
-; X64-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; X64-AVX512-NEXT: kmovd %edi, %k0
+; X64-AVX512-NEXT: knotw %k0, %k1
; X64-AVX512-NEXT: vmovdqu8 {{.*#+}} xmm0 {%k1} {z} = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
; X64-AVX512-NEXT: vmovq %xmm0, %rax
; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
More information about the llvm-commits
mailing list