[llvm] [X86] matchUnaryShuffle - add support for matching 512-bit extension patterns. (PR #127643)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 00:48:47 PST 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/127643
>From ad480c232ac75f1aed7061c7317554a443644707 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 18 Feb 2025 14:37:28 +0000
Subject: [PATCH] [X86] matchUnaryShuffle - add support for matching 512-bit
extension patterns.
Handles both BWI and non-BWI cases (skips PMOV*XBW without BWI).
The vector-interleaved-store-i8-stride-8.ll VPTERNLOG diffs are due to better value tracking now recognising the zero-extension patterns, where previously only an any-extension was assumed.
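
For illustration only (not taken from the patch or its tests, and the function name is hypothetical): the shape of 512-bit extension pattern matchUnaryShuffle can now match. The IR below interleaves the low eight i32 lanes of a 512-bit value with zeros, which is bit-identical to a v8i32 -> v8i64 zero extension, so on an AVX512F target the shuffle combiner should now be able to select a VPMOVZXDQ for such a combined shuffle rather than a variable-index VPERMD.

; Hedged sketch: a 512-bit zero-extension-in-register written as an
; interleave-with-zero shuffle (assumed to be compiled with -mattr=+avx512f).
define <16 x i32> @zext_inreg_512(<16 x i32> %a) {
  ; Lanes 0..7 of %a alternate with lanes taken from the zero vector,
  ; i.e. the low half of %a zero-extended from i32 to i64 element width.
  %r = shufflevector <16 x i32> %a, <16 x i32> zeroinitializer,
         <16 x i32> <i32 0, i32 16, i32 1, i32 16, i32 2, i32 16, i32 3, i32 16,
                     i32 4, i32 16, i32 5, i32 16, i32 6, i32 16, i32 7, i32 16>
  ret <16 x i32> %r
}

The analogous byte-to-word case (a v64i8 mask at scale 2) is only taken when AVX512BW is available, matching the PMOV*XBW guard added in the patch.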
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +-
.../vector-interleaved-store-i8-stride-8.ll | 32 +-
.../CodeGen/X86/vector-replicaton-i1-mask.ll | 360 +++++++-----------
3 files changed, 170 insertions(+), 232 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 386d56dcda9de..4d42d8d42eb09 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -38927,13 +38927,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
- // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
- if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
- (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
+ if (AllowIntDomain &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasInt256()) ||
+ (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) {
unsigned MaxScale = 64 / MaskEltSize;
bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
DAG.ComputeNumSignBits(V1) == MaskEltSize;
for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
+ // Skip 512-bit VPMOV?XBW on non-AVX512BW targets.
+ if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs())
+ continue;
bool MatchAny = true;
bool MatchZero = true;
bool MatchSign = UseSign;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index ba51c65ccab13..251139161e46f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -6905,7 +6905,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11)
; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
@@ -6927,7 +6927,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11)
; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
@@ -6944,7 +6944,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11)
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -6968,7 +6968,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11)
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
@@ -7035,7 +7035,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11)
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero
@@ -7057,7 +7057,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11)
; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
@@ -7070,7 +7070,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11)
; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9
@@ -7083,7 +7083,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11)
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
@@ -7589,7 +7589,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11)
; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
@@ -7611,7 +7611,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11)
; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
@@ -7628,7 +7628,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11)
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -7652,7 +7652,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11)
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
@@ -7719,7 +7719,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11)
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero
@@ -7741,7 +7741,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11)
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14
@@ -7754,7 +7754,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11)
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9
@@ -7767,7 +7767,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11)
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index a8df418143f32..717d1e447e165 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -98,8 +98,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512F-ONLY: # %bb.0:
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -110,8 +109,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovb (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -122,8 +120,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
@@ -145,8 +142,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
@@ -162,8 +158,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
@@ -176,8 +171,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
@@ -200,21 +194,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1
; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
-; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
+; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm1
; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0
+; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
-; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
@@ -227,21 +220,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: kmovw 2(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2
+; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3
+; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
-; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z}
-; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
-; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
-; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
+; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx)
@@ -249,47 +241,25 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32:
-; AVX512BW-ONLY: # %bb.0:
-; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
-; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
-; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
-; AVX512BW-ONLY-NEXT: vzeroupper
-; AVX512BW-ONLY-NEXT: retq
-;
-; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32:
-; AVX512VBMI-ONLY: # %bb.0:
-; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
-; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
-; AVX512VBMI-ONLY-NEXT: vzeroupper
-; AVX512VBMI-ONLY-NEXT: retq
+; AVX512BW-LABEL: mask_replication_factor2_vf32:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kmovq (%rdi), %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
%tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3, i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7, i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15, i32 16, i32 16, i32 17, i32 17, i32 18, i32 18, i32 19, i32 19, i32 20, i32 20, i32 21, i32 21, i32 22, i32 22, i32 23, i32 23, i32 24, i32 24, i32 25, i32 25, i32 26, i32 26, i32 27, i32 27, i32 28, i32 28, i32 29, i32 29, i32 30, i32 30, i32 31, i32 31>
@@ -301,42 +271,41 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind {
; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64:
; AVX512F-ONLY: # %bb.0:
-; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3
-; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5
-; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4
+; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4
+; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k3
+; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2
; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1
; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1
; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2
-; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
-; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm3
+; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2
+; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1
+; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm4
+; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3
+; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k4} {z} = -1
+; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm1
+; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0
+; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6
-; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k5} {z} = -1
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3
-; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm3, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7
-; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3
-; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512F-ONLY-NEXT: vpmovsxdq %ymm4, %zmm0
; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1
-; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z}
-; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z}
; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z}
+; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z}
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx)
@@ -351,41 +320,40 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512DQ-LABEL: mask_replication_factor2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: kmovw (%rdi), %k0
-; AVX512DQ-NEXT: kmovw 2(%rdi), %k5
-; AVX512DQ-NEXT: kmovw 4(%rdi), %k3
+; AVX512DQ-NEXT: kmovw 2(%rdi), %k3
+; AVX512DQ-NEXT: kmovw 4(%rdi), %k2
; AVX512DQ-NEXT: kmovw 6(%rdi), %k1
; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2
; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2
-; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
-; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpmovm2d %k2, %zmm2
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm3
+; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2
+; AVX512DQ-NEXT: vpmovm2d %k3, %zmm3
+; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm4
+; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3
+; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5
+; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4
-; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3
-; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6
-; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7
-; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpmovsxdq %ymm3, %zmm0
+; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7
+; AVX512DQ-NEXT: vpmovsxdq %ymm4, %zmm0
; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1
-; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
-; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z}
-; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z}
-; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z}
-; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
-; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
+; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z}
+; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z}
; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z}
+; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z}
; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx)
@@ -402,12 +370,9 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7]
-; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
-; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
@@ -441,8 +406,7 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
@@ -2525,8 +2489,7 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512BW-NEXT: vpmovw2m %zmm0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
@@ -2598,47 +2561,25 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16:
-; AVX512BW-ONLY: # %bb.0:
-; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0
-; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
-; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
-; AVX512BW-ONLY-NEXT: vzeroupper
-; AVX512BW-ONLY-NEXT: retq
-;
-; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16:
-; AVX512VBMI-ONLY: # %bb.0:
-; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
-; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
-; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx)
-; AVX512VBMI-ONLY-NEXT: vzeroupper
-; AVX512VBMI-ONLY-NEXT: retq
+; AVX512BW-LABEL: mask_replication_factor4_vf16:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kmovq (%rdi), %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0
+; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
+; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
%src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7, i32 7, i32 8, i32 8, i32 8, i32 8, i32 9, i32 9, i32 9, i32 9, i32 10, i32 10, i32 10, i32 10, i32 11, i32 11, i32 11, i32 11, i32 12, i32 12, i32 12, i32 12, i32 13, i32 13, i32 13, i32 13, i32 14, i32 14, i32 14, i32 14, i32 15, i32 15, i32 15, i32 15>
@@ -2747,11 +2688,9 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0
; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
-; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
-; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
@@ -2785,8 +2724,7 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
-; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
@@ -2997,8 +2935,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3
-; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
+; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4
; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
@@ -3060,8 +2997,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
-; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
@@ -12956,8 +12892,7 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512VBMI-ONLY: # %bb.0:
; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
-; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-ONLY-NEXT: vpmovsxbq %xmm0, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
@@ -13083,10 +13018,10 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: kmovw (%rdi), %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
-; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
+; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
@@ -13291,13 +13226,12 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3
; AVX512BW-NEXT: vpmovb2m %zmm3, %k1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2
+; AVX512BW-NEXT: vpmovb2m %zmm2, %k3
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k2
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpmovb2m %zmm1, %k3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k4
; AVX512BW-NEXT: kshiftrq $16, %k4, %k5
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
@@ -13680,16 +13614,16 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm10
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
-; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm11
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
-; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1]
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
@@ -13710,9 +13644,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
-; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
-; AVX512BW-NEXT: vpmovb2m %zmm10, %k1
-; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
+; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z}
+; AVX512BW-NEXT: vpmovb2m %zmm11, %k1
+; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm11 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
@@ -13735,8 +13669,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z}
-; AVX512BW-NEXT: vpmovb2m %zmm12, %k2
-; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
+; AVX512BW-NEXT: vpmovb2m %zmm10, %k2
+; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
@@ -13765,7 +13699,7 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx)
@@ -13775,9 +13709,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm12, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm11, 512(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx)