[llvm] [X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X, C1), C2) --> kshiftr(X, C1+C2) (PR #115528)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 8 10:11:29 PST 2024
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/115528
Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, to allow mask instructions to be computed in parallel.
From cc543bb93d6c9669a7d73b6a6a4165bf7f13d1db Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 8 Nov 2024 18:10:09 +0000
Subject: [PATCH] [X86] combineKSHIFT - fold
kshiftr(kshiftr/extract_subvector(X,C1),C2) --> kshiftr(X,C1+C2)
Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, to allow mask instructions to be computed in parallel.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +-
llvm/test/CodeGen/X86/avx512-bugfix-26264.ll | 16 +-
.../CodeGen/X86/avx512-masked-memop-64-32.ll | 8 +-
llvm/test/CodeGen/X86/pr33349.ll | 24 +-
llvm/test/CodeGen/X86/pr34177.ll | 12 +-
llvm/test/CodeGen/X86/vec_smulo.ll | 8 +-
llvm/test/CodeGen/X86/vec_umulo.ll | 8 +-
llvm/test/CodeGen/X86/vector-compress.ll | 8 +-
.../CodeGen/X86/vector-replicaton-i1-mask.ll | 290 +++++++++---------
9 files changed, 208 insertions(+), 189 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19a85a6d7ec6ce..748f885e3f8d90 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58405,11 +58405,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
-
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
return DAG.getConstant(0, SDLoc(N), VT);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ // Fold kshiftr(extract_subvector(X,C1),C2)
+ // --> extract_subvector(kshiftr(X,C1+C2),0)
+ // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
+ if (N->getOpcode() == X86ISD::KSHIFTR) {
+ SDLoc DL(N);
+ if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+ N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
+ SDValue Src = N->getOperand(0).getOperand(0);
+ uint64_t Amt = N->getConstantOperandVal(1) +
+ N->getOperand(0).getConstantOperandVal(1);
+ EVT SrcVT = Src.getValueType();
+ if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
+ SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
+ DAG.getTargetConstant(Amt, DL, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
+ DAG.getIntPtrConstant(0, DL));
+ }
+ }
+ }
+
APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
index 537f42dd9c2c59..e0f3b6c4ec90a4 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
; AVX512BW-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
; AVX512BW-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
@@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0)
; AVX512BW-NEXT: vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k1
; AVX512BW-NEXT: vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k1, %k2
; AVX512BW-NEXT: vpblendmq 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT: kshiftrd $24, %k1, %k1
; AVX512BW-NEXT: vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
; AVX512BW-NEXT: retq
%res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index bd52b9cd41584c..f6e5986afac531 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
; SKX-NEXT: vpmovb2m %ymm0, %k1
; SKX-NEXT: vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kshiftrd $8, %k1, %k2
; SKX-NEXT: vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; SKX-NEXT: kshiftrd $16, %k1, %k1
-; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: kshiftrd $16, %k1, %k2
+; SKX-NEXT: vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; SKX-NEXT: kshiftrd $24, %k1, %k1
; SKX-NEXT: vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
; SKX-NEXT: retq
%res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 83d3a33572266f..c879cb9867ab29 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL-NEXT: fldz
; KNL-NEXT: fld %st(0)
; KNL-NEXT: fcmovne %st(2), %st
-; KNL-NEXT: testb $2, %al
-; KNL-NEXT: fld %st(1)
-; KNL-NEXT: fcmovne %st(3), %st
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $1, %al
+; KNL-NEXT: fld %st(1)
+; KNL-NEXT: fcmovne %st(3), %st
+; KNL-NEXT: testb $2, %al
; KNL-NEXT: fld %st(2)
; KNL-NEXT: fcmovne %st(4), %st
-; KNL-NEXT: testb $2, %al
+; KNL-NEXT: testb $8, %al
; KNL-NEXT: fxch %st(3)
; KNL-NEXT: fcmovne %st(4), %st
; KNL-NEXT: fstp %st(4)
; KNL-NEXT: fxch %st(3)
+; KNL-NEXT: fstpt 30(%rdi)
+; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt 10(%rdi)
; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt (%rdi)
-; KNL-NEXT: fxch %st(1)
-; KNL-NEXT: fstpt 30(%rdi)
; KNL-NEXT: fstpt 20(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
@@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st
-; SKX-NEXT: testb $2, %al
-; SKX-NEXT: fld %st(1)
-; SKX-NEXT: fcmovne %st(3), %st
; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
+; SKX-NEXT: fld %st(1)
+; SKX-NEXT: fcmovne %st(3), %st
+; SKX-NEXT: testb $2, %al
; SKX-NEXT: fld %st(2)
; SKX-NEXT: fcmovne %st(4), %st
-; SKX-NEXT: testb $2, %al
+; SKX-NEXT: testb $8, %al
; SKX-NEXT: fxch %st(3)
; SKX-NEXT: fcmovne %st(4), %st
; SKX-NEXT: fstp %st(4)
; SKX-NEXT: fxch %st(3)
+; SKX-NEXT: fstpt 30(%rdi)
+; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt 10(%rdi)
; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt (%rdi)
-; SKX-NEXT: fxch %st(1)
-; SKX-NEXT: fstpt 30(%rdi)
; SKX-NEXT: fstpt 20(%rdi)
; SKX-NEXT: retq
bb:
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 29922c2ac1a716..5b2431eb214955 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
; AVX512VL-NEXT: kmovd %k0, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $8, %al
; AVX512VL-NEXT: fld1
; AVX512VL-NEXT: fldz
; AVX512VL-NEXT: fld %st(0)
; AVX512VL-NEXT: fcmovne %st(2), %st
-; AVX512VL-NEXT: testb $1, %al
+; AVX512VL-NEXT: testb $2, %al
; AVX512VL-NEXT: fld %st(1)
; AVX512VL-NEXT: fcmovne %st(3), %st
-; AVX512VL-NEXT: kmovd %k1, %eax
-; AVX512VL-NEXT: testb $2, %al
+; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fld %st(2)
; AVX512VL-NEXT: fcmovne %st(4), %st
+; AVX512VL-NEXT: kmovd %k1, %eax
; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fxch %st(3)
; AVX512VL-NEXT: fcmovne %st(4), %st
@@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: fstpt 10(%rdi)
; AVX512VL-NEXT: fxch %st(1)
; AVX512VL-NEXT: fadd %st, %st(0)
+; AVX512VL-NEXT: fstpt 60(%rdi)
+; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 20(%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt (%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
-; AVX512VL-NEXT: fstpt 60(%rdi)
-; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 40(%rdi)
%1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a
%2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 22b5246443fa8a..7e081310c35be5 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpcmpneqb %zmm1, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 4d7d2573183e07..68c6ca93576b76 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi)
; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index f8c076db65de94..17b98b5ebcaeae 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: subq $576, %rsp # imm = 0x240
; AVX512VL-NEXT: vpsllw $7, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovb2m %zmm0, %k1
+; AVX512VL-NEXT: kshiftrq $48, %k1, %k3
; AVX512VL-NEXT: kshiftrq $32, %k1, %k4
-; AVX512VL-NEXT: kshiftrd $16, %k4, %k3
-; AVX512VL-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VL-NEXT: kshiftrq $16, %k1, %k2
; AVX512VL-NEXT: vpcompressd %zmm1, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k1, %k0
+; AVX512VL-NEXT: kshiftrq $8, %k1, %k0
; AVX512VL-NEXT: kxorw %k0, %k1, %k0
; AVX512VL-NEXT: kshiftrw $4, %k0, %k5
; AVX512VL-NEXT: kxorw %k5, %k0, %k0
@@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
; AVX512VL-NEXT: vmovdqa64 %zmm0, (%rsp,%rax,4)
; AVX512VL-NEXT: vpcompressd %zmm3, %zmm0 {%k4} {z}
; AVX512VL-NEXT: vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: kshiftrw $8, %k4, %k0
+; AVX512VL-NEXT: kshiftrq $40, %k1, %k0
; AVX512VL-NEXT: kxorw %k0, %k4, %k0
; AVX512VL-NEXT: kshiftrw $4, %k0, %k4
; AVX512VL-NEXT: kxorw %k4, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 85e782e9083492..36a902637272d7 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -256,12 +256,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -277,12 +277,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -409,19 +409,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -444,19 +444,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -2605,12 +2605,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2626,12 +2626,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -2753,19 +2753,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -2788,19 +2788,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -3000,33 +3000,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k4, %k5
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k4, %k4
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k3, %k4
; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k3, %k4
; AVX512BW-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k3, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
@@ -3063,33 +3063,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k4, %k5
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k4, %k4
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k3, %k4
; AVX512VBMI-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k3, %k4
; AVX512VBMI-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k3, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k2, %k3
; AVX512VBMI-ONLY-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k2, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm15, 896(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm14, 960(%rdx)
@@ -3309,14 +3309,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
@@ -3330,14 +3330,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm1, 128(%rdx)
-; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT: vmovdqa %ymm0, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vzeroupper
; AVX512VBMI-ONLY-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
@@ -9338,12 +9338,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512BW-ONLY-NEXT: kmovq %rax, %k1
; AVX512BW-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k3
-; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
+; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
@@ -9362,12 +9362,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
; AVX512VBMI-ONLY-NEXT: kmovq %rax, %k1
; AVX512VBMI-ONLY-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k2, %k3
-; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 128(%rdx)
@@ -12938,12 +12938,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -12959,12 +12959,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT: kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2
; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx)
; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx)
@@ -13088,19 +13088,19 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm6, 448(%rdx)
@@ -13299,33 +13299,33 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vpmovb2m %zmm1, %k3
; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k4
-; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT: kshiftrq $16, %k4, %k5
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k4, %k4
-; AVX512BW-NEXT: kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT: kshiftrq $48, %k4, %k5
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k4, %k4
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT: kshiftrq $16, %k3, %k4
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k3, %k3
-; AVX512BW-NEXT: kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT: kshiftrq $48, %k3, %k4
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k3, %k3
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k3
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm15, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 960(%rdx)
@@ -13682,8 +13682,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
-; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm16
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm15
+; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15
+; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10
; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5
@@ -13691,73 +13691,73 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovb2m %zmm0, %k2
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm10, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
-; AVX512BW-NEXT: vpmovb2m %zmm15, %k2
-; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm17 {%k2} {z}
+; AVX512BW-NEXT: vpmovb2m %zmm16, %k2
+; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm16 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
-; AVX512BW-NEXT: vpmovb2m %zmm16, %k1
-; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 1152(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT: vpmovb2m %zmm15, %k1
+; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm15 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z}
; AVX512BW-NEXT: vpmovb2m %zmm12, %k2
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $16, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z}
; AVX512BW-NEXT: vpmovb2m %zmm7, %k1
; AVX512BW-NEXT: vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT: kshiftrq $48, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k1, %k2
; AVX512BW-NEXT: vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
-; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm29 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT: vmovdqa32 1920(%rsi), %zmm29 {%k2} {z}
+; AVX512BW-NEXT: vmovdqa32 1792(%rsi), %zmm30 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm31, 1984(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm30, 1920(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm29, 1920(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm28, 1856(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm29, 1792(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm30, 1792(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm27, 1728(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm26, 1664(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm25, 1600(%rdx)
@@ -13769,11 +13769,11 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm15, 1024(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 960(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm17, 896(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx)
-; AVX512BW-NEXT: vmovdqa64 %zmm15, 768(%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx)
More information about the llvm-commits mailing list