[llvm] [X86] combineKSHIFT - fold kshiftr(kshiftr/extract_subvector(X, C1), C2) --> kshiftr(X, C1+C2) (PR #115528)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 8 10:11:29 PST 2024


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/115528

Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, so that each shifted mask is produced directly from the original mask register, allowing the mask instructions to be computed in parallel instead of in a serial dependency chain.
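
The arithmetic behind the fold is simply that two serial right-shifts of a mask register combine into one, and that extracting an upper subvector of a mask behaves like a right-shift by the extract index. A minimal standalone C++ sketch of the scalar analogue (my illustration, not code from the patch; kshiftr64 is a hypothetical helper modelling KSHIFTR on a v64i1 mask):

  #include <cassert>
  #include <cstdint>

  // Hypothetical model of KSHIFTR on a v64i1 mask held in a k-register:
  // logical shift right, zero-filling; out-of-range amounts yield zero.
  static uint64_t kshiftr64(uint64_t Mask, unsigned Amt) {
    return Amt < 64 ? Mask >> Amt : 0;
  }

  int main() {
    uint64_t Mask = 0xF0F0F0F0F0F0F0F0ULL;
    // kshiftr(kshiftr(X, 32), 16) == kshiftr(X, 48); the combine guards
    // this with Amt < SrcVT.getVectorNumElements().
    assert(kshiftr64(kshiftr64(Mask, 32), 16) == kshiftr64(Mask, 48));
    // extract_subvector(X, 32) of a v64i1 mask reads bits [32,64), i.e. a
    // right-shift by 32, so the EXTRACT_SUBVECTOR form merges the same way.
    return 0;
  }

The payoff is visible in the avx512-bugfix-26264.ll diff below: the dependent pair kshiftrd $16 / kshiftrw $8 becomes two independent shifts, kshiftrd $16 and kshiftrd $24, that both read %k1 directly.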

From cc543bb93d6c9669a7d73b6a6a4165bf7f13d1db Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 8 Nov 2024 18:10:09 +0000
Subject: [PATCH] [X86] combineKSHIFT - fold
 kshiftr(kshiftr/extract_subvector(X,C1),C2) --> kshiftr(X,C1+C2)

Merge serial KSHIFTR nodes, possibly separated by EXTRACT_SUBVECTOR, to allow mask instructions to be computed in parallel.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  23 +-
 llvm/test/CodeGen/X86/avx512-bugfix-26264.ll  |  16 +-
 .../CodeGen/X86/avx512-masked-memop-64-32.ll  |   8 +-
 llvm/test/CodeGen/X86/pr33349.ll              |  24 +-
 llvm/test/CodeGen/X86/pr34177.ll              |  12 +-
 llvm/test/CodeGen/X86/vec_smulo.ll            |   8 +-
 llvm/test/CodeGen/X86/vec_umulo.ll            |   8 +-
 llvm/test/CodeGen/X86/vector-compress.ll      |   8 +-
 .../CodeGen/X86/vector-replicaton-i1-mask.ll  | 290 +++++++++---------
 9 files changed, 208 insertions(+), 189 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 19a85a6d7ec6ce..748f885e3f8d90 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58405,11 +58405,30 @@ static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
-
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
     return DAG.getConstant(0, SDLoc(N), VT);
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Fold kshiftr(extract_subvector(X,C1),C2)
+  //  --> extract_subvector(kshiftr(X,C1+C2),0)
+  // Fold kshiftr(kshiftr(X,C1),C2) --> kshiftr(X,C1+C2)
+  if (N->getOpcode() == X86ISD::KSHIFTR) {
+    SDLoc DL(N);
+    if (N->getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR ||
+        N->getOperand(0).getOpcode() == X86ISD::KSHIFTR) {
+      SDValue Src = N->getOperand(0).getOperand(0);
+      uint64_t Amt = N->getConstantOperandVal(1) +
+                     N->getOperand(0).getConstantOperandVal(1);
+      EVT SrcVT = Src.getValueType();
+      if (TLI.isTypeLegal(SrcVT) && Amt < SrcVT.getVectorNumElements()) {
+        SDValue Shift = DAG.getNode(X86ISD::KSHIFTR, DL, SrcVT, Src,
+                                    DAG.getTargetConstant(Amt, DL, MVT::i8));
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shift,
+                           DAG.getIntPtrConstant(0, DL));
+      }
+    }
+  }
+
   APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
   if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
     return SDValue(N, 0);
diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
index 537f42dd9c2c59..e0f3b6c4ec90a4 100644
--- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
+++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll
@@ -7,11 +7,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
 ; AVX512BW-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k1
 ; AVX512BW-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $8, %k1, %k2
 ; AVX512BW-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT:    kshiftrd $24, %k1, %k1
 ; AVX512BW-NEXT:    vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
 ; AVX512BW-NEXT:    retq
   %res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
@@ -24,11 +24,11 @@ define <32 x i64> @test_load_32i64(ptr %ptrs, <32 x i1> %mask, <32 x i64> %src0)
 ; AVX512BW-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k1
 ; AVX512BW-NEXT:    vpblendmq (%rdi), %zmm1, %zmm0 {%k1}
-; AVX512BW-NEXT:    kshiftrw $8, %k1, %k2
+; AVX512BW-NEXT:    kshiftrd $8, %k1, %k2
 ; AVX512BW-NEXT:    vpblendmq 64(%rdi), %zmm2, %zmm1 {%k2}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
-; AVX512BW-NEXT:    vpblendmq 128(%rdi), %zmm3, %zmm2 {%k1}
-; AVX512BW-NEXT:    kshiftrw $8, %k1, %k1
+; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    vpblendmq 128(%rdi), %zmm3, %zmm2 {%k2}
+; AVX512BW-NEXT:    kshiftrd $24, %k1, %k1
 ; AVX512BW-NEXT:    vpblendmq 192(%rdi), %zmm4, %zmm3 {%k1}
 ; AVX512BW-NEXT:    retq
   %res = call <32 x i64> @llvm.masked.load.v32i64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0)
diff --git a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index bd52b9cd41584c..f6e5986afac531 100644
--- a/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/llvm/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -261,11 +261,11 @@ define <32 x double> @test_load_32f64(ptr %ptrs, <32 x i1> %mask, <32 x double>
 ; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; SKX-NEXT:    vpmovb2m %ymm0, %k1
 ; SKX-NEXT:    vblendmpd (%rdi), %zmm1, %zmm0 {%k1}
-; SKX-NEXT:    kshiftrw $8, %k1, %k2
+; SKX-NEXT:    kshiftrd $8, %k1, %k2
 ; SKX-NEXT:    vblendmpd 64(%rdi), %zmm2, %zmm1 {%k2}
-; SKX-NEXT:    kshiftrd $16, %k1, %k1
-; SKX-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm2 {%k1}
-; SKX-NEXT:    kshiftrw $8, %k1, %k1
+; SKX-NEXT:    kshiftrd $16, %k1, %k2
+; SKX-NEXT:    vblendmpd 128(%rdi), %zmm3, %zmm2 {%k2}
+; SKX-NEXT:    kshiftrd $24, %k1, %k1
 ; SKX-NEXT:    vblendmpd 192(%rdi), %zmm4, %zmm3 {%k1}
 ; SKX-NEXT:    retq
   %res = call <32 x double> @llvm.masked.load.v32f64.p0(ptr %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 83d3a33572266f..c879cb9867ab29 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -17,23 +17,23 @@ target triple = "x86_64-unknown-linux-gnu"
 ; KNL-NEXT:    fldz
 ; KNL-NEXT:    fld %st(0)
 ; KNL-NEXT:    fcmovne %st(2), %st
-; KNL-NEXT:    testb $2, %al
-; KNL-NEXT:    fld %st(1)
-; KNL-NEXT:    fcmovne %st(3), %st
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    testb $1, %al
+; KNL-NEXT:    fld %st(1)
+; KNL-NEXT:    fcmovne %st(3), %st
+; KNL-NEXT:    testb $2, %al
 ; KNL-NEXT:    fld %st(2)
 ; KNL-NEXT:    fcmovne %st(4), %st
-; KNL-NEXT:    testb $2, %al
+; KNL-NEXT:    testb $8, %al
 ; KNL-NEXT:    fxch %st(3)
 ; KNL-NEXT:    fcmovne %st(4), %st
 ; KNL-NEXT:    fstp %st(4)
 ; KNL-NEXT:    fxch %st(3)
+; KNL-NEXT:    fstpt 30(%rdi)
+; KNL-NEXT:    fxch %st(1)
 ; KNL-NEXT:    fstpt 10(%rdi)
 ; KNL-NEXT:    fxch %st(1)
 ; KNL-NEXT:    fstpt (%rdi)
-; KNL-NEXT:    fxch %st(1)
-; KNL-NEXT:    fstpt 30(%rdi)
 ; KNL-NEXT:    fstpt 20(%rdi)
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
@@ -49,23 +49,23 @@ target triple = "x86_64-unknown-linux-gnu"
 ; SKX-NEXT:    fldz
 ; SKX-NEXT:    fld %st(0)
 ; SKX-NEXT:    fcmovne %st(2), %st
-; SKX-NEXT:    testb $2, %al
-; SKX-NEXT:    fld %st(1)
-; SKX-NEXT:    fcmovne %st(3), %st
 ; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    testb $1, %al
+; SKX-NEXT:    fld %st(1)
+; SKX-NEXT:    fcmovne %st(3), %st
+; SKX-NEXT:    testb $2, %al
 ; SKX-NEXT:    fld %st(2)
 ; SKX-NEXT:    fcmovne %st(4), %st
-; SKX-NEXT:    testb $2, %al
+; SKX-NEXT:    testb $8, %al
 ; SKX-NEXT:    fxch %st(3)
 ; SKX-NEXT:    fcmovne %st(4), %st
 ; SKX-NEXT:    fstp %st(4)
 ; SKX-NEXT:    fxch %st(3)
+; SKX-NEXT:    fstpt 30(%rdi)
+; SKX-NEXT:    fxch %st(1)
 ; SKX-NEXT:    fstpt 10(%rdi)
 ; SKX-NEXT:    fxch %st(1)
 ; SKX-NEXT:    fstpt (%rdi)
-; SKX-NEXT:    fxch %st(1)
-; SKX-NEXT:    fstpt 30(%rdi)
 ; SKX-NEXT:    fstpt 20(%rdi)
 ; SKX-NEXT:    retq
  bb:
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 29922c2ac1a716..5b2431eb214955 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -51,18 +51,18 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
 ; AVX512VL-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
 ; AVX512VL-NEXT:    kshiftrb $2, %k0, %k1
 ; AVX512VL-NEXT:    kmovd %k0, %eax
-; AVX512VL-NEXT:    testb $2, %al
+; AVX512VL-NEXT:    testb $8, %al
 ; AVX512VL-NEXT:    fld1
 ; AVX512VL-NEXT:    fldz
 ; AVX512VL-NEXT:    fld %st(0)
 ; AVX512VL-NEXT:    fcmovne %st(2), %st
-; AVX512VL-NEXT:    testb $1, %al
+; AVX512VL-NEXT:    testb $2, %al
 ; AVX512VL-NEXT:    fld %st(1)
 ; AVX512VL-NEXT:    fcmovne %st(3), %st
-; AVX512VL-NEXT:    kmovd %k1, %eax
-; AVX512VL-NEXT:    testb $2, %al
+; AVX512VL-NEXT:    testb $1, %al
 ; AVX512VL-NEXT:    fld %st(2)
 ; AVX512VL-NEXT:    fcmovne %st(4), %st
+; AVX512VL-NEXT:    kmovd %k1, %eax
 ; AVX512VL-NEXT:    testb $1, %al
 ; AVX512VL-NEXT:    fxch %st(3)
 ; AVX512VL-NEXT:    fcmovne %st(4), %st
@@ -77,12 +77,12 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
 ; AVX512VL-NEXT:    fstpt 10(%rdi)
 ; AVX512VL-NEXT:    fxch %st(1)
 ; AVX512VL-NEXT:    fadd %st, %st(0)
+; AVX512VL-NEXT:    fstpt 60(%rdi)
+; AVX512VL-NEXT:    fadd %st, %st(0)
 ; AVX512VL-NEXT:    fstpt 20(%rdi)
 ; AVX512VL-NEXT:    fadd %st, %st(0)
 ; AVX512VL-NEXT:    fstpt (%rdi)
 ; AVX512VL-NEXT:    fadd %st, %st(0)
-; AVX512VL-NEXT:    fstpt 60(%rdi)
-; AVX512VL-NEXT:    fadd %st, %st(0)
 ; AVX512VL-NEXT:    fstpt 40(%rdi)
   %1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a
   %2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll
index 22b5246443fa8a..7e081310c35be5 100644
--- a/llvm/test/CodeGen/X86/vec_smulo.ll
+++ b/llvm/test/CodeGen/X86/vec_smulo.ll
@@ -2668,11 +2668,11 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
 ; AVX512BW-NEXT:    vpcmpneqb %zmm1, %zmm0, %k1
 ; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdi)
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 4d7d2573183e07..68c6ca93576b76 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -2329,11 +2329,11 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
 ; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vptestmb %zmm0, %zmm0, %k1
 ; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm1 {%k2} {z} = -1
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
 ; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdi)
 ; AVX512BW-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index f8c076db65de94..17b98b5ebcaeae 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -840,12 +840,12 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX512VL-NEXT:    subq $576, %rsp # imm = 0x240
 ; AVX512VL-NEXT:    vpsllw $7, %zmm0, %zmm0
 ; AVX512VL-NEXT:    vpmovb2m %zmm0, %k1
+; AVX512VL-NEXT:    kshiftrq $48, %k1, %k3
 ; AVX512VL-NEXT:    kshiftrq $32, %k1, %k4
-; AVX512VL-NEXT:    kshiftrd $16, %k4, %k3
-; AVX512VL-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VL-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VL-NEXT:    vpcompressd %zmm1, %zmm0 {%k1} {z}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm0, (%rsp)
-; AVX512VL-NEXT:    kshiftrw $8, %k1, %k0
+; AVX512VL-NEXT:    kshiftrq $8, %k1, %k0
 ; AVX512VL-NEXT:    kxorw %k0, %k1, %k0
 ; AVX512VL-NEXT:    kshiftrw $4, %k0, %k5
 ; AVX512VL-NEXT:    kxorw %k5, %k0, %k0
@@ -859,7 +859,7 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i
 ; AVX512VL-NEXT:    vmovdqa64 %zmm0, (%rsp,%rax,4)
 ; AVX512VL-NEXT:    vpcompressd %zmm3, %zmm0 {%k4} {z}
 ; AVX512VL-NEXT:    vmovdqa64 %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    kshiftrw $8, %k4, %k0
+; AVX512VL-NEXT:    kshiftrq $40, %k1, %k0
 ; AVX512VL-NEXT:    kxorw %k0, %k4, %k0
 ; AVX512VL-NEXT:    kshiftrw $4, %k0, %k4
 ; AVX512VL-NEXT:    kxorw %k4, %k0, %k0
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 85e782e9083492..36a902637272d7 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -256,12 +256,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
 ; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
@@ -277,12 +277,12 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
@@ -409,19 +409,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
@@ -444,19 +444,19 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
@@ -2605,12 +2605,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63]
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
@@ -2626,12 +2626,12 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
@@ -2753,19 +2753,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k2
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
@@ -2788,19 +2788,19 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k2
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
@@ -3000,33 +3000,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb %zmm2, %zmm0, %zmm0
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k4
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k4, %k5
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k4, %k4
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k4, %k5
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k4, %k4
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k3, %k4
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k3, %k3
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k3, %k4
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k3, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
@@ -3063,33 +3063,33 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k4
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k4, %k5
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k4, %k4
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k4, %k5
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k4, %k4
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k3, %k4
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k3, %k3
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k3, %k4
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k3, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
@@ -3309,14 +3309,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512BW-ONLY-NEXT:    movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
 ; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
 ; AVX512BW-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
-; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
-; AVX512BW-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
-; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT:    vmovdqa %ymm0, 128(%rdx)
 ; AVX512BW-ONLY-NEXT:    vzeroupper
 ; AVX512BW-ONLY-NEXT:    retq
 ;
@@ -3330,14 +3330,14 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512VBMI-ONLY-NEXT:    movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF
 ; AVX512VBMI-ONLY-NEXT:    kmovq %rax, %k1
 ; AVX512VBMI-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm0 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
-; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm1 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm0 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 64(%rdx)
-; AVX512VBMI-ONLY-NEXT:    vmovdqa %ymm1, 128(%rdx)
-; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm0, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm1, (%rdx)
+; AVX512VBMI-ONLY-NEXT:    vmovdqa %ymm0, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vzeroupper
 ; AVX512VBMI-ONLY-NEXT:    retq
   %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
@@ -9338,12 +9338,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512BW-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
 ; AVX512BW-ONLY-NEXT:    kmovq %rax, %k1
 ; AVX512BW-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k2, %k3
-; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
+; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
@@ -9362,12 +9362,12 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512VBMI-ONLY-NEXT:    movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF
 ; AVX512VBMI-ONLY-NEXT:    kmovq %rax, %k1
 ; AVX512VBMI-ONLY-NEXT:    vpcmpgtb %zmm0, %zmm1, %k1 {%k1}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k2, %k3
-; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k3} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 64(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 128(%rdx)
@@ -12938,12 +12938,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512BW-ONLY-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
 ; AVX512BW-ONLY-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
 ; AVX512BW-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512BW-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512BW-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
@@ -12959,12 +12959,12 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512VBMI-ONLY-NEXT:    vpermb %zmm0, %zmm1, %zmm0
 ; AVX512VBMI-ONLY-NEXT:    vpmovb2m %zmm0, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k2} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512VBMI-ONLY-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k2} {z}
+; AVX512VBMI-ONLY-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k1} {z}
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm3, 128(%rdx)
 ; AVX512VBMI-ONLY-NEXT:    vmovdqa64 %zmm2, 192(%rdx)
@@ -13088,19 +13088,19 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55]
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k2
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k3} {z}
 ; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k3} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
 ; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm7, 384(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm6, 448(%rdx)
@@ -13299,33 +13299,33 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k3
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k4
-; AVX512BW-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT:    kshiftrq $16, %k4, %k5
 ; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k5} {z}
 ; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k4} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k4, %k4
-; AVX512BW-NEXT:    kshiftrd $16, %k4, %k5
+; AVX512BW-NEXT:    kshiftrq $48, %k4, %k5
 ; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm2 {%k5} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k4, %k4
 ; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm3 {%k4} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT:    kshiftrq $16, %k3, %k4
 ; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k4} {z}
 ; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k3} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k3, %k3
-; AVX512BW-NEXT:    kshiftrd $16, %k3, %k4
+; AVX512BW-NEXT:    kshiftrq $48, %k3, %k4
 ; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm6 {%k4} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k3, %k3
 ; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm7 {%k3} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    kshiftrq $16, %k2, %k3
 ; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm8 {%k3} {z}
 ; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm9 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k3
+; AVX512BW-NEXT:    kshiftrq $48, %k2, %k3
 ; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm10 {%k3} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
 ; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm12 {%k2} {z}
 ; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm14 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm15 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm15, 896(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, 960(%rdx)
@@ -13682,8 +13682,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7]
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm12
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5]
-; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm16
-; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm15
+; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm15
+; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm16
 ; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3]
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm1, %zmm10
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm1, %zmm5
@@ -13691,73 +13691,73 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT:    vpshufb %zmm2, %zmm0, %zmm1
 ; AVX512BW-NEXT:    vpshufb %zmm3, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovb2m %zmm0, %k2
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 64(%rsi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
 ; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 (%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT:    vmovdqa32 128(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT:    vmovdqa32 192(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 320(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k2} {z}
 ; AVX512BW-NEXT:    vpmovb2m %zmm5, %k2
 ; AVX512BW-NEXT:    vmovdqa32 256(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 448(%rsi), %zmm8 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k1} {z}
 ; AVX512BW-NEXT:    vpmovb2m %zmm10, %k1
 ; AVX512BW-NEXT:    vmovdqa32 512(%rsi), %zmm10 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT:    vmovdqa32 640(%rsi), %zmm11 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT:    vmovdqa32 704(%rsi), %zmm13 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 832(%rsi), %zmm14 {%k2} {z}
-; AVX512BW-NEXT:    vpmovb2m %zmm15, %k2
-; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm15 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm17 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 896(%rsi), %zmm17 {%k2} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm16, %k2
+; AVX512BW-NEXT:    vmovdqa32 768(%rsi), %zmm16 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 960(%rsi), %zmm18 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 1088(%rsi), %zmm19 {%k1} {z}
-; AVX512BW-NEXT:    vpmovb2m %zmm16, %k1
-; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm16 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm20 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 1152(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT:    vpmovb2m %zmm15, %k1
+; AVX512BW-NEXT:    vmovdqa32 1024(%rsi), %zmm15 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT:    vmovdqa32 1216(%rsi), %zmm21 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 1344(%rsi), %zmm22 {%k2} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm23 {%k2} {z}
 ; AVX512BW-NEXT:    vpmovb2m %zmm12, %k2
 ; AVX512BW-NEXT:    vmovdqa32 1280(%rsi), %zmm12 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    vmovdqa32 1408(%rsi), %zmm23 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 1472(%rsi), %zmm24 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k1
+; AVX512BW-NEXT:    kshiftrq $16, %k2, %k1
 ; AVX512BW-NEXT:    vmovdqa32 1600(%rsi), %zmm25 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $32, %k2, %k1
+; AVX512BW-NEXT:    vmovdqa32 1664(%rsi), %zmm26 {%k1} {z}
 ; AVX512BW-NEXT:    vpmovb2m %zmm7, %k1
 ; AVX512BW-NEXT:    vmovdqa32 1536(%rsi), %zmm7 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k2, %k2
-; AVX512BW-NEXT:    vmovdqa32 1664(%rsi), %zmm26 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k2, %k2
+; AVX512BW-NEXT:    kshiftrq $48, %k2, %k2
 ; AVX512BW-NEXT:    vmovdqa32 1728(%rsi), %zmm27 {%k2} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
+; AVX512BW-NEXT:    kshiftrq $16, %k1, %k2
 ; AVX512BW-NEXT:    vmovdqa32 1856(%rsi), %zmm28 {%k2} {z}
-; AVX512BW-NEXT:    vmovdqa32 1792(%rsi), %zmm29 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
-; AVX512BW-NEXT:    vmovdqa32 1920(%rsi), %zmm30 {%k1} {z}
-; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
+; AVX512BW-NEXT:    kshiftrq $32, %k1, %k2
+; AVX512BW-NEXT:    vmovdqa32 1920(%rsi), %zmm29 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa32 1792(%rsi), %zmm30 {%k1} {z}
+; AVX512BW-NEXT:    kshiftrq $48, %k1, %k1
 ; AVX512BW-NEXT:    vmovdqa32 1984(%rsi), %zmm31 {%k1} {z}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm31, 1984(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1920(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1920(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm28, 1856(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm29, 1792(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm30, 1792(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm27, 1728(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm26, 1664(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm25, 1600(%rdx)
@@ -13769,11 +13769,11 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
 ; AVX512BW-NEXT:    vmovdqa64 %zmm21, 1216(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm20, 1152(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm19, 1088(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm16, 1024(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm15, 1024(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm18, 960(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm17, 896(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm14, 832(%rdx)
-; AVX512BW-NEXT:    vmovdqa64 %zmm15, 768(%rdx)
+; AVX512BW-NEXT:    vmovdqa64 %zmm16, 768(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm13, 704(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm11, 640(%rdx)
 ; AVX512BW-NEXT:    vmovdqa64 %zmm9, 576(%rdx)


