[llvm] bbbb93e - Revert "[DAG] Fold insert_subvector undef, (extract_subvector X, 0), 0 with non-matching types"
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 18 12:00:35 PDT 2023
Author: Craig Topper
Date: 2023-08-18T12:00:07-07:00
New Revision: bbbb93eb48debcebe8fffbc845c8794740e2ebd3
URL: https://github.com/llvm/llvm-project/commit/bbbb93eb48debcebe8fffbc845c8794740e2ebd3
DIFF: https://github.com/llvm/llvm-project/commit/bbbb93eb48debcebe8fffbc845c8794740e2ebd3.diff
LOG: Revert "[DAG] Fold insert_subvector undef, (extract_subvector X, 0), 0 with non-matching types"
This reverts commit 770be43f6782dab84d215d01b37396d63a9c2b6e.
I forgot to remove this change from my tree while experimenting.
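
For context, the reverted combine (see the DAGCombiner.cpp hunk below) also folded an insert of an
extracted subvector into undef when the source and destination vector types did not match, provided
the index was 0 and both types agreed on scalability. The following LLVM IR is a minimal, hypothetical
illustration of the kind of pattern involved; the function name and the specific vector types are
illustrative choices and are not taken from this commit.

; Hypothetical example: insert the low part of a wider vector into undef.
; llvm.vector.extract/llvm.vector.insert roughly correspond to the
; EXTRACT_SUBVECTOR/INSERT_SUBVECTOR DAG nodes the combine operated on.
declare <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32>, i64)
declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv2i32(<vscale x 4 x i32>, <vscale x 2 x i32>, i64)

define <vscale x 4 x i32> @insert_extract_lo(<vscale x 8 x i32> %x) {
  ; Extract the low <vscale x 2 x i32> slice at index 0.
  %lo = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv8i32(<vscale x 8 x i32> %x, i64 0)
  ; Insert it at index 0 of an undef vector whose type differs from %x.
  %r = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv2i32(<vscale x 4 x i32> undef, <vscale x 2 x i32> %lo, i64 0)
  ret <vscale x 4 x i32> %r
}

The reverted code would have reduced the resulting insert_subvector undef, (extract_subvector X, 0), 0
DAG pattern to a single extract_subvector (or insert_subvector, depending on the relative element
counts) of X even though the source type (<vscale x 8 x i32>) differs from the result type
(<vscale x 4 x i32>); with the revert, the fold once again fires only when the two types match exactly.
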
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll
llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
llvm/test/CodeGen/X86/pr33349.ll
llvm/test/CodeGen/X86/pr34177.ll
llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index be314b32cf6b19..fa5ba2efc8a8c3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -25597,25 +25597,10 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
return N0;
// If this is an insert of an extracted vector into an undef vector, we can
- // just use the input to the extract if the types match, and can simplify
- // in some cases even if they don't..
+ // just use the input to the extract.
if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N1.getOperand(1) == N2) {
- EVT SrcVT = N1.getOperand(0).getValueType();
- if (SrcVT == VT)
- return N1.getOperand(0);
- // TODO: To remove the zero check, need to adjust the offset to
- // a multiple of the new src type.
- if (isNullConstant(N2) &&
- VT.isScalableVector() == SrcVT.isScalableVector()) {
- if (VT.getVectorMinNumElements() >= SrcVT.getVectorMinNumElements())
- return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
- VT, N0, N1.getOperand(0), N2);
- else
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
- VT, N1.getOperand(0), N2);
- }
- }
+ N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
+ return N1.getOperand(0);
// Simplify scalar inserts into an undef vector:
// insert_subvector undef, (splat X), N2 -> splat X
diff --git a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
index ddcc787468675f..fe45772fab4f27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/active_lane_mask.ll
@@ -103,15 +103,15 @@ define <8 x i1> @fv8(ptr %p, i64 %index, i64 %tc) {
define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv32:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI8_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsaddu.vx v8, v8, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vmsltu.vx v0, v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: ret
@@ -122,15 +122,15 @@ define <32 x i1> @fv32(ptr %p, i64 %index, i64 %tc) {
define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI9_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsaddu.vx v8, v8, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vmsltu.vx v0, v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI9_1)
@@ -157,15 +157,15 @@ define <64 x i1> @fv64(ptr %p, i64 %index, i64 %tc) {
define <128 x i1> @fv128(ptr %p, i64 %index, i64 %tc) {
; CHECK-LABEL: fv128:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI10_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0)
+; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT: vle64.v v8, (a0)
-; CHECK-NEXT: vid.v v16
-; CHECK-NEXT: vsaddu.vx v16, v16, a1
-; CHECK-NEXT: vmsltu.vx v0, v16, a2
; CHECK-NEXT: vsaddu.vx v8, v8, a1
; CHECK-NEXT: vmsltu.vx v16, v8, a2
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vsaddu.vx v8, v8, a1
+; CHECK-NEXT: vmsltu.vx v0, v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; CHECK-NEXT: vslideup.vi v0, v16, 2
; CHECK-NEXT: lui a0, %hi(.LCPI10_1)
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
index a2d02b6bb641b2..77ea6c0b26d0a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -469,14 +469,13 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v13, v10, a0
; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v12, v9, a0
+; CHECK-NEXT: vslidedown.vx v8, v9, a0
; CHECK-NEXT: add a1, a0, a0
; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma
-; CHECK-NEXT: vslideup.vx v12, v10, a0
-; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT: vslidedown.vx v9, v10, a0
; CHECK-NEXT: ret
%res = call <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half> %in, i64 6)
ret <vscale x 6 x half> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
index 11f9b2db5ea169..19d50748c0c725 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extload-truncstore.ll
@@ -141,12 +141,13 @@ define <4 x i64> @sextload_v4i8_v4i64(ptr %x) {
; LMULMAX1-LABEL: sextload_v4i8_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf8 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v9, v8
-; LMULMAX1-NEXT: vsext.vf8 v8, v10
+; LMULMAX1-NEXT: vsext.vf8 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v4i8_v4i64:
@@ -164,12 +165,13 @@ define <4 x i64> @zextload_v4i8_v4i64(ptr %x) {
; LMULMAX1-LABEL: zextload_v4i8_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf8 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v9, v8
-; LMULMAX1-NEXT: vzext.vf8 v8, v10
+; LMULMAX1-NEXT: vzext.vf8 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v4i8_v4i64:
@@ -211,12 +213,13 @@ define <8 x i32> @sextload_v8i8_v8i32(ptr %x) {
; LMULMAX1-LABEL: sextload_v8i8_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v10
+; LMULMAX1-NEXT: vsext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i8_v8i32:
@@ -234,12 +237,13 @@ define <8 x i32> @zextload_v8i8_v8i32(ptr %x) {
; LMULMAX1-LABEL: zextload_v8i8_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v10
+; LMULMAX1-NEXT: vzext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i8_v8i32:
@@ -265,13 +269,13 @@ define <8 x i64> @sextload_v8i8_v8i64(ptr %x) {
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf8 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf8 v9, v12
+; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf8 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i8_v8i64:
@@ -297,13 +301,13 @@ define <8 x i64> @zextload_v8i8_v8i64(ptr %x) {
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf8 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf8 v9, v12
+; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf8 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i8_v8i64:
@@ -321,12 +325,13 @@ define <16 x i16> @sextload_v16i8_v16i16(ptr %x) {
; LMULMAX1-LABEL: sextload_v16i8_v16i16:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i8_v16i16:
@@ -344,12 +349,13 @@ define <16 x i16> @zextload_v16i8_v16i16(ptr %x) {
; LMULMAX1-LABEL: zextload_v16i8_v16i16:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX1-NEXT: vle8.v v10, (a0)
+; LMULMAX1-NEXT: vle8.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 8
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i8_v16i16:
@@ -375,13 +381,13 @@ define <16 x i32> @sextload_v16i8_v16i32(ptr %x) {
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
-; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v9, v12
+; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i8_v16i32:
@@ -407,13 +413,13 @@ define <16 x i32> @zextload_v16i8_v16i32(ptr %x) {
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
-; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v9, v12
+; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i8_v16i32:
@@ -442,37 +448,38 @@ define <16 x i64> @sextload_v16i8_v16i64(ptr %x) {
; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf8 v9, v13
+; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf8 v10, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf8 v13, v14
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v11, 4
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v14, v11
+; LMULMAX1-NEXT: vsext.vf8 v14, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v15, v11
-; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v10, v11
+; LMULMAX1-NEXT: vsext.vf8 v11, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf8 v11, v16
+; LMULMAX1-NEXT: vsext.vf8 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i8_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX4-NEXT: vle8.v v16, (a0)
+; LMULMAX4-NEXT: vle8.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vsext.vf8 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vsext.vf8 v12, v8
-; LMULMAX4-NEXT: vsext.vf8 v8, v16
+; LMULMAX4-NEXT: vsext.vf8 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i8>, ptr %x
%z = sext <16 x i8> %y to <16 x i64>
@@ -494,37 +501,38 @@ define <16 x i64> @zextload_v16i8_v16i64(ptr %x) {
; LMULMAX1-NEXT: vslidedown.vi v13, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf8 v9, v13
+; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v15, v10, 4
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf8 v10, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v14, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf8 v13, v14
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v11, 4
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v14, v11
+; LMULMAX1-NEXT: vzext.vf8 v14, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v11, 2
+; LMULMAX1-NEXT: vslidedown.vi v15, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v15, v11
-; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v11, v10, 4
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v10, v11
+; LMULMAX1-NEXT: vzext.vf8 v11, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v16, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf8 v11, v16
+; LMULMAX1-NEXT: vzext.vf8 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i8_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; LMULMAX4-NEXT: vle8.v v16, (a0)
+; LMULMAX4-NEXT: vle8.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vzext.vf8 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vzext.vf8 v12, v8
-; LMULMAX4-NEXT: vzext.vf8 v8, v16
+; LMULMAX4-NEXT: vzext.vf8 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i8>, ptr %x
%z = zext <16 x i8> %y to <16 x i64>
@@ -652,12 +660,13 @@ define <4 x i64> @sextload_v4i16_v4i64(ptr %x) {
; LMULMAX1-LABEL: sextload_v4i16_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v10
+; LMULMAX1-NEXT: vsext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v4i16_v4i64:
@@ -675,12 +684,13 @@ define <4 x i64> @zextload_v4i16_v4i64(ptr %x) {
; LMULMAX1-LABEL: zextload_v4i16_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v8
-; LMULMAX1-NEXT: vzext.vf4 v8, v10
+; LMULMAX1-NEXT: vzext.vf4 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v4i16_v4i64:
@@ -710,12 +720,13 @@ define <8 x i32> @sextload_v8i16_v8i32(ptr %x) {
; LMULMAX1-LABEL: sextload_v8i16_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i16_v8i32:
@@ -733,12 +744,13 @@ define <8 x i32> @zextload_v8i16_v8i32(ptr %x) {
; LMULMAX1-LABEL: zextload_v8i16_v8i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i16_v8i32:
@@ -764,13 +776,13 @@ define <8 x i64> @sextload_v8i16_v8i64(ptr %x) {
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v9, v12
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i16_v8i64:
@@ -796,13 +808,13 @@ define <8 x i64> @zextload_v8i16_v8i64(ptr %x) {
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v9, v12
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i16_v8i64:
@@ -842,19 +854,20 @@ define <16 x i32> @sextload_v16i16_v16i32(ptr %x) {
; LMULMAX1-LABEL: sextload_v16i16_v16i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
+; LMULMAX1-NEXT: vsext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v11, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v12
+; LMULMAX1-NEXT: vsext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i16_v16i32:
@@ -872,19 +885,20 @@ define <16 x i32> @zextload_v16i16_v16i32(ptr %x) {
; LMULMAX1-LABEL: zextload_v16i16_v16i32:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; LMULMAX1-NEXT: vle16.v v10, (a0)
+; LMULMAX1-NEXT: vle16.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle16.v v12, (a0)
+; LMULMAX1-NEXT: vle16.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
+; LMULMAX1-NEXT: vzext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v12, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v11, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v12
+; LMULMAX1-NEXT: vzext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i16_v16i32:
@@ -911,38 +925,39 @@ define <16 x i64> @sextload_v16i16_v16i64(ptr %x) {
; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v10, v11
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v9, v12
; LMULMAX1-NEXT: vsext.vf4 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v15, v16
+; LMULMAX1-NEXT: vsext.vf4 v9, v16
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v11, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v13, v16
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf4 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i16_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX4-NEXT: vle16.v v16, (a0)
+; LMULMAX4-NEXT: vle16.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vsext.vf4 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vsext.vf4 v12, v8
-; LMULMAX4-NEXT: vsext.vf4 v8, v16
+; LMULMAX4-NEXT: vsext.vf4 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i16>, ptr %x
%z = sext <16 x i16> %y to <16 x i64>
@@ -962,38 +977,39 @@ define <16 x i64> @zextload_v16i16_v16i64(ptr %x) {
; LMULMAX1-NEXT: vslidedown.vi v11, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v10, v11
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v11, v12
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v12, v9, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v9, v12
; LMULMAX1-NEXT: vzext.vf4 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v15, v13, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf4 v15, v16
+; LMULMAX1-NEXT: vzext.vf4 v9, v16
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v16, v11, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v11, v16
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v16, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; LMULMAX1-NEXT: vzext.vf4 v13, v16
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf4 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i16_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e16, m2, ta, ma
-; LMULMAX4-NEXT: vle16.v v16, (a0)
+; LMULMAX4-NEXT: vle16.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vzext.vf4 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e16, m2, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vzext.vf4 v12, v8
-; LMULMAX4-NEXT: vzext.vf4 v8, v16
+; LMULMAX4-NEXT: vzext.vf4 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i16>, ptr %x
%z = zext <16 x i16> %y to <16 x i64>
@@ -1080,12 +1096,13 @@ define <4 x i64> @sextload_v4i32_v4i64(ptr %x) {
; LMULMAX1-LABEL: sextload_v4i32_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v4i32_v4i64:
@@ -1103,12 +1120,13 @@ define <4 x i64> @zextload_v4i32_v4i64(ptr %x) {
; LMULMAX1-LABEL: zextload_v4i32_v4i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v4i32_v4i64:
@@ -1177,19 +1195,20 @@ define <8 x i64> @sextload_v8i32_v8i64(ptr %x) {
; LMULMAX1-LABEL: sextload_v8i32_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
+; LMULMAX1-NEXT: vsext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v11, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v12
+; LMULMAX1-NEXT: vsext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v8i32_v8i64:
@@ -1207,19 +1226,20 @@ define <8 x i64> @zextload_v8i32_v8i64(ptr %x) {
; LMULMAX1-LABEL: zextload_v8i32_v8i64:
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
+; LMULMAX1-NEXT: vzext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v11, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v12
+; LMULMAX1-NEXT: vzext.vf2 v11, v12
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v8i32_v8i64:
@@ -1308,43 +1328,45 @@ define <16 x i64> @sextload_v16i32_v16i64(ptr %x) {
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: addi a1, a0, 48
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v16, (a1)
+; LMULMAX1-NEXT: vle32.v v15, (a1)
; LMULMAX1-NEXT: addi a1, a0, 32
-; LMULMAX1-NEXT: vle32.v v14, (a1)
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v13, (a1)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vsext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v9, v8
-; LMULMAX1-NEXT: vsext.vf2 v8, v10
+; LMULMAX1-NEXT: vsext.vf2 v9, v10
+; LMULMAX1-NEXT: vsext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v11, v10
+; LMULMAX1-NEXT: vsext.vf2 v11, v12
+; LMULMAX1-NEXT: vsext.vf2 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v14, 2
+; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v13, v10
+; LMULMAX1-NEXT: vsext.vf2 v13, v14
+; LMULMAX1-NEXT: vsext.vf2 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v15, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v12
-; LMULMAX1-NEXT: vsext.vf2 v12, v14
-; LMULMAX1-NEXT: vsext.vf2 v14, v16
+; LMULMAX1-NEXT: vsext.vf2 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: sextload_v16i32_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; LMULMAX4-NEXT: vle32.v v16, (a0)
+; LMULMAX4-NEXT: vle32.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vsext.vf2 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vsext.vf2 v12, v8
-; LMULMAX4-NEXT: vsext.vf2 v8, v16
+; LMULMAX4-NEXT: vsext.vf2 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i32>, ptr %x
%z = sext <16 x i32> %y to <16 x i64>
@@ -1356,43 +1378,45 @@ define <16 x i64> @zextload_v16i32_v16i64(ptr %x) {
; LMULMAX1: # %bb.0:
; LMULMAX1-NEXT: addi a1, a0, 48
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vle32.v v16, (a1)
+; LMULMAX1-NEXT: vle32.v v15, (a1)
; LMULMAX1-NEXT: addi a1, a0, 32
-; LMULMAX1-NEXT: vle32.v v14, (a1)
-; LMULMAX1-NEXT: vle32.v v10, (a0)
+; LMULMAX1-NEXT: vle32.v v13, (a1)
+; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: addi a0, a0, 16
-; LMULMAX1-NEXT: vle32.v v12, (a0)
+; LMULMAX1-NEXT: vle32.v v11, (a0)
+; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; LMULMAX1-NEXT: vzext.vf2 v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v10, 2
+; LMULMAX1-NEXT: vslidedown.vi v10, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v9, v8
-; LMULMAX1-NEXT: vzext.vf2 v8, v10
+; LMULMAX1-NEXT: vzext.vf2 v9, v10
+; LMULMAX1-NEXT: vzext.vf2 v10, v11
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v12, 2
+; LMULMAX1-NEXT: vslidedown.vi v12, v11, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v11, v10
+; LMULMAX1-NEXT: vzext.vf2 v11, v12
+; LMULMAX1-NEXT: vzext.vf2 v12, v13
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v14, 2
+; LMULMAX1-NEXT: vslidedown.vi v14, v13, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v13, v10
+; LMULMAX1-NEXT: vzext.vf2 v13, v14
+; LMULMAX1-NEXT: vzext.vf2 v14, v15
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v16, 2
+; LMULMAX1-NEXT: vslidedown.vi v16, v15, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v15, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v12
-; LMULMAX1-NEXT: vzext.vf2 v12, v14
-; LMULMAX1-NEXT: vzext.vf2 v14, v16
+; LMULMAX1-NEXT: vzext.vf2 v15, v16
; LMULMAX1-NEXT: ret
;
; LMULMAX4-LABEL: zextload_v16i32_v16i64:
; LMULMAX4: # %bb.0:
; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; LMULMAX4-NEXT: vle32.v v16, (a0)
+; LMULMAX4-NEXT: vle32.v v12, (a0)
+; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; LMULMAX4-NEXT: vzext.vf2 v8, v12
; LMULMAX4-NEXT: vsetivli zero, 8, e32, m4, ta, ma
-; LMULMAX4-NEXT: vslidedown.vi v8, v16, 8
+; LMULMAX4-NEXT: vslidedown.vi v16, v12, 8
; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma
-; LMULMAX4-NEXT: vzext.vf2 v12, v8
-; LMULMAX4-NEXT: vzext.vf2 v8, v16
+; LMULMAX4-NEXT: vzext.vf2 v12, v16
; LMULMAX4-NEXT: ret
%y = load <16 x i32>, ptr %x
%z = zext <16 x i32> %y to <16 x i64>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
index 2339afe576ecbe..e8b39f2758ccb4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-conv.ll
@@ -84,27 +84,27 @@ define void @fpext_v8f16_v8f64(ptr %x, ptr %y) {
; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v9
; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
; LMULMAX1-NEXT: vfwcvt.f.f.v v9, v10
-; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8
-; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v11, v10
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4
+; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8
+; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v11
+; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX1-NEXT: vfwcvt.f.f.v v11, v12
+; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
+; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v8
; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; LMULMAX1-NEXT: vfwcvt.f.f.v v8, v12
+; LMULMAX1-NEXT: vsetvli zero, zero, e16, mf4, ta, ma
; LMULMAX1-NEXT: vfwcvt.f.f.v v12, v10
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
-; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v8
; LMULMAX1-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.f.f.v v8, v10
-; LMULMAX1-NEXT: addi a0, a1, 48
-; LMULMAX1-NEXT: vse64.v v8, (a0)
+; LMULMAX1-NEXT: vfwcvt.f.f.v v10, v12
; LMULMAX1-NEXT: addi a0, a1, 32
-; LMULMAX1-NEXT: vse64.v v12, (a0)
-; LMULMAX1-NEXT: vse64.v v11, (a1)
+; LMULMAX1-NEXT: vse64.v v10, (a0)
+; LMULMAX1-NEXT: vse64.v v8, (a1)
+; LMULMAX1-NEXT: addi a0, a1, 48
+; LMULMAX1-NEXT: vse64.v v11, (a0)
; LMULMAX1-NEXT: addi a1, a1, 16
; LMULMAX1-NEXT: vse64.v v9, (a1)
; LMULMAX1-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
index 129855f46ac8c4..54ee38ab46716e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp2i.ll
@@ -493,20 +493,20 @@ define void @fp2si_v8f32_v8i64(ptr %x, ptr %y) {
; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v10, v8
-; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v11, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v9, 2
+; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v9, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.x.f.v v12, v9
; LMULMAX1-NEXT: addi a0, a1, 16
-; LMULMAX1-NEXT: vse64.v v9, (a0)
-; LMULMAX1-NEXT: addi a0, a1, 48
; LMULMAX1-NEXT: vse64.v v12, (a0)
-; LMULMAX1-NEXT: vse64.v v11, (a1)
+; LMULMAX1-NEXT: vse64.v v8, (a1)
+; LMULMAX1-NEXT: addi a0, a1, 48
+; LMULMAX1-NEXT: vse64.v v11, (a0)
; LMULMAX1-NEXT: addi a0, a1, 32
; LMULMAX1-NEXT: vse64.v v10, (a0)
; LMULMAX1-NEXT: ret
@@ -533,20 +533,20 @@ define void @fp2ui_v8f32_v8i64(ptr %x, ptr %y) {
; LMULMAX1-NEXT: vle32.v v9, (a0)
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v10, v8
-; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v11, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v8, v9
; LMULMAX1-NEXT: vsetivli zero, 2, e32, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v9, 2
+; LMULMAX1-NEXT: vslidedown.vi v9, v9, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v9, v8
+; LMULMAX1-NEXT: vfwcvt.rtz.xu.f.v v12, v9
; LMULMAX1-NEXT: addi a0, a1, 16
-; LMULMAX1-NEXT: vse64.v v9, (a0)
-; LMULMAX1-NEXT: addi a0, a1, 48
; LMULMAX1-NEXT: vse64.v v12, (a0)
-; LMULMAX1-NEXT: vse64.v v11, (a1)
+; LMULMAX1-NEXT: vse64.v v8, (a1)
+; LMULMAX1-NEXT: addi a0, a1, 48
+; LMULMAX1-NEXT: vse64.v v11, (a0)
; LMULMAX1-NEXT: addi a0, a1, 32
; LMULMAX1-NEXT: vse64.v v10, (a0)
; LMULMAX1-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
index e3cc70fdcd91f3..cb88c29ee85d0a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-i2fp.ll
@@ -449,23 +449,22 @@ define void @si2fp_v8i16_v8f64(ptr %x, ptr %y) {
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT: vsext.vf2 v10, v9
; LMULMAX1-NEXT: vfwcvt.f.x.v v9, v10
-; LMULMAX1-NEXT: vsext.vf2 v10, v8
-; LMULMAX1-NEXT: vfwcvt.f.x.v v11, v10
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
-; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v10, v8
-; LMULMAX1-NEXT: vfwcvt.f.x.v v12, v10
+; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vsext.vf2 v10, v8
-; LMULMAX1-NEXT: vfwcvt.f.x.v v8, v10
-; LMULMAX1-NEXT: addi a0, a1, 48
-; LMULMAX1-NEXT: vse64.v v8, (a0)
+; LMULMAX1-NEXT: vsext.vf2 v12, v11
+; LMULMAX1-NEXT: vfwcvt.f.x.v v11, v12
+; LMULMAX1-NEXT: vsext.vf2 v12, v8
+; LMULMAX1-NEXT: vfwcvt.f.x.v v8, v12
+; LMULMAX1-NEXT: vsext.vf2 v12, v10
+; LMULMAX1-NEXT: vfwcvt.f.x.v v10, v12
; LMULMAX1-NEXT: addi a0, a1, 32
-; LMULMAX1-NEXT: vse64.v v12, (a0)
-; LMULMAX1-NEXT: vse64.v v11, (a1)
+; LMULMAX1-NEXT: vse64.v v10, (a0)
+; LMULMAX1-NEXT: vse64.v v8, (a1)
+; LMULMAX1-NEXT: addi a0, a1, 48
+; LMULMAX1-NEXT: vse64.v v11, (a0)
; LMULMAX1-NEXT: addi a1, a1, 16
; LMULMAX1-NEXT: vse64.v v9, (a1)
; LMULMAX1-NEXT: ret
@@ -494,23 +493,22 @@ define void @ui2fp_v8i16_v8f64(ptr %x, ptr %y) {
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
; LMULMAX1-NEXT: vzext.vf2 v10, v9
; LMULMAX1-NEXT: vfwcvt.f.xu.v v9, v10
-; LMULMAX1-NEXT: vzext.vf2 v10, v8
-; LMULMAX1-NEXT: vfwcvt.f.xu.v v11, v10
; LMULMAX1-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
-; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v10, v8
-; LMULMAX1-NEXT: vfwcvt.f.xu.v v12, v10
+; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4
; LMULMAX1-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 2
+; LMULMAX1-NEXT: vslidedown.vi v11, v10, 2
; LMULMAX1-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT: vzext.vf2 v10, v8
-; LMULMAX1-NEXT: vfwcvt.f.xu.v v8, v10
-; LMULMAX1-NEXT: addi a0, a1, 48
-; LMULMAX1-NEXT: vse64.v v8, (a0)
+; LMULMAX1-NEXT: vzext.vf2 v12, v11
+; LMULMAX1-NEXT: vfwcvt.f.xu.v v11, v12
+; LMULMAX1-NEXT: vzext.vf2 v12, v8
+; LMULMAX1-NEXT: vfwcvt.f.xu.v v8, v12
+; LMULMAX1-NEXT: vzext.vf2 v12, v10
+; LMULMAX1-NEXT: vfwcvt.f.xu.v v10, v12
; LMULMAX1-NEXT: addi a0, a1, 32
-; LMULMAX1-NEXT: vse64.v v12, (a0)
-; LMULMAX1-NEXT: vse64.v v11, (a1)
+; LMULMAX1-NEXT: vse64.v v10, (a0)
+; LMULMAX1-NEXT: vse64.v v8, (a1)
+; LMULMAX1-NEXT: addi a0, a1, 48
+; LMULMAX1-NEXT: vse64.v v11, (a0)
; LMULMAX1-NEXT: addi a1, a1, 16
; LMULMAX1-NEXT: vse64.v v9, (a1)
; LMULMAX1-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
index e0aaf9e6407c5b..4686870754cea0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-exttrunc.ll
@@ -90,20 +90,19 @@ define void @sext_v32i8_v32i32(ptr %x, ptr %z) {
; LMULMAX2-NEXT: vslidedown.vi v10, v8, 8
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; LMULMAX2-NEXT: vsext.vf4 v12, v10
-; LMULMAX2-NEXT: vsext.vf4 v10, v8
; LMULMAX2-NEXT: vsetivli zero, 16, e8, m2, ta, ma
-; LMULMAX2-NEXT: vslidedown.vi v8, v8, 16
-; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT: vsext.vf4 v14, v8
+; LMULMAX2-NEXT: vslidedown.vi v10, v8, 16
; LMULMAX2-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX2-NEXT: vslidedown.vi v8, v8, 8
+; LMULMAX2-NEXT: vslidedown.vi v9, v10, 8
; LMULMAX2-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; LMULMAX2-NEXT: vsext.vf4 v14, v9
; LMULMAX2-NEXT: vsext.vf4 v16, v8
-; LMULMAX2-NEXT: addi a0, a1, 96
-; LMULMAX2-NEXT: vse32.v v16, (a0)
+; LMULMAX2-NEXT: vsext.vf4 v8, v10
; LMULMAX2-NEXT: addi a0, a1, 64
+; LMULMAX2-NEXT: vse32.v v8, (a0)
+; LMULMAX2-NEXT: vse32.v v16, (a1)
+; LMULMAX2-NEXT: addi a0, a1, 96
; LMULMAX2-NEXT: vse32.v v14, (a0)
-; LMULMAX2-NEXT: vse32.v v10, (a1)
; LMULMAX2-NEXT: addi a0, a1, 32
; LMULMAX2-NEXT: vse32.v v12, (a0)
; LMULMAX2-NEXT: ret
@@ -118,41 +117,39 @@ define void @sext_v32i8_v32i32(ptr %x, ptr %z) {
; LMULMAX1-NEXT: vslidedown.vi v10, v8, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; LMULMAX1-NEXT: vsext.vf4 v11, v10
-; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v10, v9, 4
-; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v12, v10
-; LMULMAX1-NEXT: vsext.vf4 v10, v8
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 8
+; LMULMAX1-NEXT: vslidedown.vi v10, v8, 8
+; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; LMULMAX1-NEXT: vslidedown.vi v12, v10, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v13, v8
+; LMULMAX1-NEXT: vsext.vf4 v13, v12
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v8, v8, 4
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v14, v8
-; LMULMAX1-NEXT: vsext.vf4 v8, v9
+; LMULMAX1-NEXT: vsext.vf4 v14, v12
; LMULMAX1-NEXT: vsetivli zero, 8, e8, m1, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v9, v9, 8
-; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v15, v9
+; LMULMAX1-NEXT: vslidedown.vi v12, v9, 8
; LMULMAX1-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; LMULMAX1-NEXT: vslidedown.vi v9, v9, 4
+; LMULMAX1-NEXT: vslidedown.vi v15, v12, 4
; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT: vsext.vf4 v16, v9
-; LMULMAX1-NEXT: addi a0, a1, 48
-; LMULMAX1-NEXT: vse32.v v16, (a0)
+; LMULMAX1-NEXT: vsext.vf4 v16, v15
+; LMULMAX1-NEXT: vsext.vf4 v15, v8
+; LMULMAX1-NEXT: vsext.vf4 v8, v10
+; LMULMAX1-NEXT: vsext.vf4 v10, v9
+; LMULMAX1-NEXT: vsext.vf4 v9, v12
; LMULMAX1-NEXT: addi a0, a1, 32
-; LMULMAX1-NEXT: vse32.v v15, (a0)
-; LMULMAX1-NEXT: vse32.v v8, (a1)
-; LMULMAX1-NEXT: addi a0, a1, 112
-; LMULMAX1-NEXT: vse32.v v14, (a0)
+; LMULMAX1-NEXT: vse32.v v9, (a0)
+; LMULMAX1-NEXT: vse32.v v10, (a1)
; LMULMAX1-NEXT: addi a0, a1, 96
-; LMULMAX1-NEXT: vse32.v v13, (a0)
+; LMULMAX1-NEXT: vse32.v v8, (a0)
; LMULMAX1-NEXT: addi a0, a1, 64
-; LMULMAX1-NEXT: vse32.v v10, (a0)
+; LMULMAX1-NEXT: vse32.v v15, (a0)
+; LMULMAX1-NEXT: addi a0, a1, 48
+; LMULMAX1-NEXT: vse32.v v16, (a0)
; LMULMAX1-NEXT: addi a0, a1, 16
-; LMULMAX1-NEXT: vse32.v v12, (a0)
+; LMULMAX1-NEXT: vse32.v v14, (a0)
+; LMULMAX1-NEXT: addi a0, a1, 112
+; LMULMAX1-NEXT: vse32.v v13, (a0)
; LMULMAX1-NEXT: addi a0, a1, 80
; LMULMAX1-NEXT: vse32.v v11, (a0)
; LMULMAX1-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
index ea7ab339729bc9..79e589a5a5e013 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll
@@ -780,19 +780,19 @@ define void @sdiv_v6i16(ptr %x, ptr %y) {
; CHECK-LABEL: sdiv_v6i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vdiv.vv v10, v8, v9
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 4
-; CHECK-NEXT: vslidedown.vi v11, v9, 4
+; CHECK-NEXT: vslidedown.vi v9, v9, 4
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vdiv.vv v10, v11, v10
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vdiv.vv v8, v9, v8
+; CHECK-NEXT: vdiv.vv v8, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vslideup.vi v10, v8, 4
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: vse16.v v10, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
%b = load <6 x i16>, ptr %y
@@ -869,19 +869,19 @@ define void @srem_v6i16(ptr %x, ptr %y) {
; CHECK-LABEL: srem_v6i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vrem.vv v10, v8, v9
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 4
-; CHECK-NEXT: vslidedown.vi v11, v9, 4
+; CHECK-NEXT: vslidedown.vi v9, v9, 4
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vrem.vv v10, v11, v10
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vrem.vv v8, v9, v8
+; CHECK-NEXT: vrem.vv v8, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vslideup.vi v10, v8, 4
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: vse16.v v10, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
%b = load <6 x i16>, ptr %y
@@ -958,19 +958,19 @@ define void @udiv_v6i16(ptr %x, ptr %y) {
; CHECK-LABEL: udiv_v6i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vdivu.vv v10, v8, v9
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 4
-; CHECK-NEXT: vslidedown.vi v11, v9, 4
+; CHECK-NEXT: vslidedown.vi v9, v9, 4
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vdivu.vv v10, v11, v10
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vdivu.vv v8, v9, v8
+; CHECK-NEXT: vdivu.vv v8, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vslideup.vi v10, v8, 4
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: vse16.v v10, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
%b = load <6 x i16>, ptr %y
@@ -1047,19 +1047,19 @@ define void @urem_v6i16(ptr %x, ptr %y) {
; CHECK-LABEL: urem_v6i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: vle16.v v9, (a0)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vremu.vv v10, v8, v9
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 4
-; CHECK-NEXT: vslidedown.vi v11, v9, 4
+; CHECK-NEXT: vslidedown.vi v9, v9, 4
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vremu.vv v10, v11, v10
-; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vremu.vv v8, v9, v8
+; CHECK-NEXT: vremu.vv v8, v8, v9
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: vslideup.vi v10, v8, 4
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: vse16.v v10, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
%b = load <6 x i16>, ptr %y
@@ -1244,22 +1244,22 @@ define void @mulhu_v6i16(ptr %x) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vid.v v9
-; CHECK-NEXT: vadd.vi v9, v9, 12
-; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 4
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT: vdivu.vv v9, v10, v9
; CHECK-NEXT: lui a1, %hi(.LCPI67_0)
; CHECK-NEXT: addi a1, a1, %lo(.LCPI67_0)
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v10, (a1)
+; CHECK-NEXT: vle16.v v9, (a1)
+; CHECK-NEXT: vdivu.vv v9, v8, v9
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vid.v v10
+; CHECK-NEXT: vadd.vi v10, v10, 12
+; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v8, 4
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vdivu.vv v8, v8, v10
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 4
+; CHECK-NEXT: vslideup.vi v9, v8, 4
; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma
-; CHECK-NEXT: vse16.v v8, (a0)
+; CHECK-NEXT: vse16.v v9, (a0)
; CHECK-NEXT: ret
%a = load <6 x i16>, ptr %x
%b = udiv <6 x i16> %a, <i16 7, i16 9, i16 10, i16 11, i16 12, i16 13>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 2fd904967c8a92..2c12ab6b981761 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -743,71 +743,69 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB12_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB12_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB12_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB12_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB12_13
-; RV64ZVE32F-NEXT: .LBB12_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB12_14
-; RV64ZVE32F-NEXT: .LBB12_7: # %else11
+; RV64ZVE32F-NEXT: .LBB12_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB12_9
-; RV64ZVE32F-NEXT: .LBB12_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB12_10
+; RV64ZVE32F-NEXT: .LBB12_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB12_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB12_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB12_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB12_16
-; RV64ZVE32F-NEXT: .LBB12_11: # %else20
+; RV64ZVE32F-NEXT: .LBB12_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB12_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, mf2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB12_6
; RV64ZVE32F-NEXT: .LBB12_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB12_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB12_8
; RV64ZVE32F-NEXT: .LBB12_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, mf2, tu, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB12_8
-; RV64ZVE32F-NEXT: j .LBB12_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB12_9
+; RV64ZVE32F-NEXT: j .LBB12_10
; RV64ZVE32F-NEXT: .LBB12_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -816,7 +814,7 @@ define <8 x i8> @mgather_baseidx_v8i8(ptr %base, <8 x i8> %idxs, <8 x i1> %m, <8
; RV64ZVE32F-NEXT: vsetivli zero, 7, e8, mf2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB12_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB12_12
; RV64ZVE32F-NEXT: .LBB12_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -1470,76 +1468,74 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB23_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB23_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB23_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB23_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB23_13
-; RV64ZVE32F-NEXT: .LBB23_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB23_14
-; RV64ZVE32F-NEXT: .LBB23_7: # %else11
+; RV64ZVE32F-NEXT: .LBB23_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB23_9
-; RV64ZVE32F-NEXT: .LBB23_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB23_10
+; RV64ZVE32F-NEXT: .LBB23_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB23_9: # %else14
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB23_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB23_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB23_16
-; RV64ZVE32F-NEXT: .LBB23_11: # %else20
+; RV64ZVE32F-NEXT: .LBB23_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB23_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB23_6
; RV64ZVE32F-NEXT: .LBB23_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB23_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB23_8
; RV64ZVE32F-NEXT: .LBB23_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB23_8
-; RV64ZVE32F-NEXT: j .LBB23_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB23_9
+; RV64ZVE32F-NEXT: j .LBB23_10
; RV64ZVE32F-NEXT: .LBB23_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -1549,7 +1545,7 @@ define <8 x i16> @mgather_baseidx_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB23_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB23_12
; RV64ZVE32F-NEXT: .LBB23_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -1615,76 +1611,74 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB24_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB24_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB24_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB24_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB24_13
-; RV64ZVE32F-NEXT: .LBB24_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB24_14
-; RV64ZVE32F-NEXT: .LBB24_7: # %else11
+; RV64ZVE32F-NEXT: .LBB24_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB24_9
-; RV64ZVE32F-NEXT: .LBB24_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB24_10
+; RV64ZVE32F-NEXT: .LBB24_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB24_9: # %else14
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB24_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB24_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB24_16
-; RV64ZVE32F-NEXT: .LBB24_11: # %else20
+; RV64ZVE32F-NEXT: .LBB24_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB24_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB24_6
; RV64ZVE32F-NEXT: .LBB24_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB24_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB24_8
; RV64ZVE32F-NEXT: .LBB24_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB24_8
-; RV64ZVE32F-NEXT: j .LBB24_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB24_9
+; RV64ZVE32F-NEXT: j .LBB24_10
; RV64ZVE32F-NEXT: .LBB24_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -1694,7 +1688,7 @@ define <8 x i16> @mgather_baseidx_sext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB24_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB24_12
; RV64ZVE32F-NEXT: .LBB24_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -1763,80 +1757,78 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB25_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB25_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB25_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB25_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB25_13
-; RV64ZVE32F-NEXT: .LBB25_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB25_14
-; RV64ZVE32F-NEXT: .LBB25_7: # %else11
+; RV64ZVE32F-NEXT: .LBB25_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB25_9
-; RV64ZVE32F-NEXT: .LBB25_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB25_10
+; RV64ZVE32F-NEXT: .LBB25_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB25_9: # %else14
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB25_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB25_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB25_16
-; RV64ZVE32F-NEXT: .LBB25_11: # %else20
+; RV64ZVE32F-NEXT: .LBB25_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB25_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB25_6
; RV64ZVE32F-NEXT: .LBB25_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB25_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB25_8
; RV64ZVE32F-NEXT: .LBB25_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB25_8
-; RV64ZVE32F-NEXT: j .LBB25_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB25_9
+; RV64ZVE32F-NEXT: j .LBB25_10
; RV64ZVE32F-NEXT: .LBB25_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -1847,7 +1839,7 @@ define <8 x i16> @mgather_baseidx_zext_v8i8_v8i16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB25_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB25_12
; RV64ZVE32F-NEXT: .LBB25_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -1914,75 +1906,73 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m,
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB26_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB26_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB26_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lh a2, 0(a2)
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB26_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB26_13
-; RV64ZVE32F-NEXT: .LBB26_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB26_14
-; RV64ZVE32F-NEXT: .LBB26_7: # %else11
+; RV64ZVE32F-NEXT: .LBB26_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB26_9
-; RV64ZVE32F-NEXT: .LBB26_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB26_10
+; RV64ZVE32F-NEXT: .LBB26_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB26_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB26_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB26_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB26_16
-; RV64ZVE32F-NEXT: .LBB26_11: # %else20
+; RV64ZVE32F-NEXT: .LBB26_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB26_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB26_6
; RV64ZVE32F-NEXT: .LBB26_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB26_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB26_8
; RV64ZVE32F-NEXT: .LBB26_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lh a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB26_8
-; RV64ZVE32F-NEXT: j .LBB26_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB26_9
+; RV64ZVE32F-NEXT: j .LBB26_10
; RV64ZVE32F-NEXT: .LBB26_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -1992,7 +1982,7 @@ define <8 x i16> @mgather_baseidx_v8i16(ptr %base, <8 x i16> %idxs, <8 x i1> %m,
; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB26_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB26_12
; RV64ZVE32F-NEXT: .LBB26_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -2538,58 +2528,56 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB35_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB35_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB35_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB35_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB35_13
-; RV64ZVE32F-NEXT: .LBB35_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB35_14
-; RV64ZVE32F-NEXT: .LBB35_7: # %else11
+; RV64ZVE32F-NEXT: .LBB35_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB35_9
-; RV64ZVE32F-NEXT: .LBB35_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB35_10
+; RV64ZVE32F-NEXT: .LBB35_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB35_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB35_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB35_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB35_16
-; RV64ZVE32F-NEXT: .LBB35_11: # %else20
+; RV64ZVE32F-NEXT: .LBB35_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB35_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB35_6
; RV64ZVE32F-NEXT: .LBB35_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
@@ -2598,19 +2586,19 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB35_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB35_8
; RV64ZVE32F-NEXT: .LBB35_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB35_8
-; RV64ZVE32F-NEXT: j .LBB35_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB35_9
+; RV64ZVE32F-NEXT: j .LBB35_10
; RV64ZVE32F-NEXT: .LBB35_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -2621,7 +2609,7 @@ define <8 x i32> @mgather_baseidx_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB35_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB35_12
; RV64ZVE32F-NEXT: .LBB35_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -2688,58 +2676,56 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB36_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB36_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB36_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB36_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB36_13
-; RV64ZVE32F-NEXT: .LBB36_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB36_14
-; RV64ZVE32F-NEXT: .LBB36_7: # %else11
+; RV64ZVE32F-NEXT: .LBB36_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB36_9
-; RV64ZVE32F-NEXT: .LBB36_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB36_10
+; RV64ZVE32F-NEXT: .LBB36_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB36_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB36_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB36_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB36_16
-; RV64ZVE32F-NEXT: .LBB36_11: # %else20
+; RV64ZVE32F-NEXT: .LBB36_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB36_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB36_6
; RV64ZVE32F-NEXT: .LBB36_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
@@ -2748,19 +2734,19 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB36_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB36_8
; RV64ZVE32F-NEXT: .LBB36_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB36_8
-; RV64ZVE32F-NEXT: j .LBB36_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB36_9
+; RV64ZVE32F-NEXT: j .LBB36_10
; RV64ZVE32F-NEXT: .LBB36_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -2771,7 +2757,7 @@ define <8 x i32> @mgather_baseidx_sext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB36_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB36_12
; RV64ZVE32F-NEXT: .LBB36_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -2841,83 +2827,81 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB37_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB37_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB37_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB37_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB37_13
-; RV64ZVE32F-NEXT: .LBB37_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB37_14
-; RV64ZVE32F-NEXT: .LBB37_7: # %else11
+; RV64ZVE32F-NEXT: .LBB37_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB37_9
-; RV64ZVE32F-NEXT: .LBB37_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB37_10
+; RV64ZVE32F-NEXT: .LBB37_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB37_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB37_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB37_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB37_16
-; RV64ZVE32F-NEXT: .LBB37_11: # %else20
+; RV64ZVE32F-NEXT: .LBB37_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB37_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB37_6
; RV64ZVE32F-NEXT: .LBB37_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB37_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB37_8
; RV64ZVE32F-NEXT: .LBB37_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB37_8
-; RV64ZVE32F-NEXT: j .LBB37_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB37_9
+; RV64ZVE32F-NEXT: j .LBB37_10
; RV64ZVE32F-NEXT: .LBB37_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -2929,7 +2913,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i8_v8i32(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB37_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB37_12
; RV64ZVE32F-NEXT: .LBB37_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -2999,58 +2983,56 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB38_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB38_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB38_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB38_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB38_13
-; RV64ZVE32F-NEXT: .LBB38_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB38_14
-; RV64ZVE32F-NEXT: .LBB38_7: # %else11
+; RV64ZVE32F-NEXT: .LBB38_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB38_9
-; RV64ZVE32F-NEXT: .LBB38_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB38_10
+; RV64ZVE32F-NEXT: .LBB38_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB38_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB38_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB38_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB38_16
-; RV64ZVE32F-NEXT: .LBB38_11: # %else20
+; RV64ZVE32F-NEXT: .LBB38_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB38_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB38_6
; RV64ZVE32F-NEXT: .LBB38_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
@@ -3059,19 +3041,19 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB38_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB38_8
; RV64ZVE32F-NEXT: .LBB38_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB38_8
-; RV64ZVE32F-NEXT: j .LBB38_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB38_9
+; RV64ZVE32F-NEXT: j .LBB38_10
; RV64ZVE32F-NEXT: .LBB38_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -3082,7 +3064,7 @@ define <8 x i32> @mgather_baseidx_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB38_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB38_12
; RV64ZVE32F-NEXT: .LBB38_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -3150,58 +3132,56 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB39_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB39_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB39_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lw a2, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB39_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB39_13
-; RV64ZVE32F-NEXT: .LBB39_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB39_14
-; RV64ZVE32F-NEXT: .LBB39_7: # %else11
+; RV64ZVE32F-NEXT: .LBB39_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB39_9
-; RV64ZVE32F-NEXT: .LBB39_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB39_10
+; RV64ZVE32F-NEXT: .LBB39_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB39_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB39_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB39_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB39_16
-; RV64ZVE32F-NEXT: .LBB39_11: # %else20
+; RV64ZVE32F-NEXT: .LBB39_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB39_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lw a2, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB39_6
; RV64ZVE32F-NEXT: .LBB39_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
@@ -3210,19 +3190,19 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB39_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB39_8
; RV64ZVE32F-NEXT: .LBB39_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lw a2, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB39_8
-; RV64ZVE32F-NEXT: j .LBB39_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB39_9
+; RV64ZVE32F-NEXT: j .LBB39_10
; RV64ZVE32F-NEXT: .LBB39_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -3233,7 +3213,7 @@ define <8 x i32> @mgather_baseidx_sext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB39_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB39_12
; RV64ZVE32F-NEXT: .LBB39_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -3306,60 +3286,58 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB40_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB40_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB40_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: and a3, a3, a1
+; RV64ZVE32F-NEXT: slli a3, a3, 2
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: lw a3, 0(a3)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a3
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB40_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB40_13
-; RV64ZVE32F-NEXT: .LBB40_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB40_14
-; RV64ZVE32F-NEXT: .LBB40_7: # %else11
+; RV64ZVE32F-NEXT: .LBB40_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_9
-; RV64ZVE32F-NEXT: .LBB40_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a3, .LBB40_10
+; RV64ZVE32F-NEXT: .LBB40_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: lw a3, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a3
+; RV64ZVE32F-NEXT: vmv.s.x v12, a3
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB40_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB40_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a3, .LBB40_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
; RV64ZVE32F-NEXT: bnez a2, .LBB40_16
-; RV64ZVE32F-NEXT: .LBB40_11: # %else20
+; RV64ZVE32F-NEXT: .LBB40_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB40_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: lw a3, 0(a3)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a3
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_6
; RV64ZVE32F-NEXT: .LBB40_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
@@ -3369,20 +3347,20 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB40_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB40_8
; RV64ZVE32F-NEXT: .LBB40_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: lw a3, 0(a3)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vmv.s.x v8, a3
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vmv.s.x v12, a3
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB40_8
-; RV64ZVE32F-NEXT: j .LBB40_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB40_9
+; RV64ZVE32F-NEXT: j .LBB40_10
; RV64ZVE32F-NEXT: .LBB40_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a3, a3, a1
@@ -3394,7 +3372,7 @@ define <8 x i32> @mgather_baseidx_zext_v8i16_v8i32(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB40_11
+; RV64ZVE32F-NEXT: beqz a2, .LBB40_12
; RV64ZVE32F-NEXT: .LBB40_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -4383,22 +4361,22 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-LABEL: mgather_baseidx_v8i8_v8i64:
; RV64ZVE32F: # %bb.0:
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a5, v0
-; RV64ZVE32F-NEXT: andi a3, a5, 1
+; RV64ZVE32F-NEXT: vmv.x.s a6, v0
+; RV64ZVE32F-NEXT: andi a3, a6, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB48_3
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: ld a3, 0(a3)
-; RV64ZVE32F-NEXT: andi a4, a5, 2
+; RV64ZVE32F-NEXT: andi a4, a6, 2
; RV64ZVE32F-NEXT: bnez a4, .LBB48_4
; RV64ZVE32F-NEXT: .LBB48_2:
; RV64ZVE32F-NEXT: ld a4, 8(a2)
; RV64ZVE32F-NEXT: j .LBB48_5
; RV64ZVE32F-NEXT: .LBB48_3:
; RV64ZVE32F-NEXT: ld a3, 0(a2)
-; RV64ZVE32F-NEXT: andi a4, a5, 2
+; RV64ZVE32F-NEXT: andi a4, a6, 2
; RV64ZVE32F-NEXT: beqz a4, .LBB48_2
; RV64ZVE32F-NEXT: .LBB48_4: # %cond.load1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
@@ -4408,83 +4386,87 @@ define <8 x i64> @mgather_baseidx_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8 x i1>
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
; RV64ZVE32F-NEXT: .LBB48_5: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: andi a6, a5, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: beqz a6, .LBB48_10
+; RV64ZVE32F-NEXT: andi a5, a6, 4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a5, .LBB48_7
; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a6, v8
-; RV64ZVE32F-NEXT: slli a6, a6, 3
-; RV64ZVE32F-NEXT: add a6, a1, a6
-; RV64ZVE32F-NEXT: ld a6, 0(a6)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: bnez a7, .LBB48_11
+; RV64ZVE32F-NEXT: vmv.x.s a5, v9
+; RV64ZVE32F-NEXT: slli a5, a5, 3
+; RV64ZVE32F-NEXT: add a5, a1, a5
+; RV64ZVE32F-NEXT: ld a5, 0(a5)
+; RV64ZVE32F-NEXT: j .LBB48_8
; RV64ZVE32F-NEXT: .LBB48_7:
-; RV64ZVE32F-NEXT: ld a7, 24(a2)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: bnez t0, .LBB48_12
-; RV64ZVE32F-NEXT: .LBB48_8:
-; RV64ZVE32F-NEXT: ld t0, 32(a2)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: bnez t1, .LBB48_13
-; RV64ZVE32F-NEXT: .LBB48_9:
-; RV64ZVE32F-NEXT: ld t1, 40(a2)
-; RV64ZVE32F-NEXT: j .LBB48_14
-; RV64ZVE32F-NEXT: .LBB48_10:
-; RV64ZVE32F-NEXT: ld a6, 16(a2)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: beqz a7, .LBB48_7
-; RV64ZVE32F-NEXT: .LBB48_11: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a7, v8
+; RV64ZVE32F-NEXT: ld a5, 16(a2)
+; RV64ZVE32F-NEXT: .LBB48_8: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: andi a7, a6, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: beqz a7, .LBB48_12
+; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a7, v9
; RV64ZVE32F-NEXT: slli a7, a7, 3
; RV64ZVE32F-NEXT: add a7, a1, a7
; RV64ZVE32F-NEXT: ld a7, 0(a7)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: beqz t0, .LBB48_8
-; RV64ZVE32F-NEXT: .LBB48_12: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s t0, v9
-; RV64ZVE32F-NEXT: slli t0, t0, 3
-; RV64ZVE32F-NEXT: add t0, a1, t0
-; RV64ZVE32F-NEXT: ld t0, 0(t0)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: beqz t1, .LBB48_9
-; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s t1, v8
-; RV64ZVE32F-NEXT: slli t1, t1, 3
+; RV64ZVE32F-NEXT: andi t0, a6, 16
+; RV64ZVE32F-NEXT: bnez t0, .LBB48_13
+; RV64ZVE32F-NEXT: .LBB48_10:
+; RV64ZVE32F-NEXT: ld t0, 32(a2)
+; RV64ZVE32F-NEXT: andi t1, a6, 32
+; RV64ZVE32F-NEXT: bnez t1, .LBB48_14
+; RV64ZVE32F-NEXT: .LBB48_11:
+; RV64ZVE32F-NEXT: ld t1, 40(a2)
+; RV64ZVE32F-NEXT: j .LBB48_15
+; RV64ZVE32F-NEXT: .LBB48_12:
+; RV64ZVE32F-NEXT: ld a7, 24(a2)
+; RV64ZVE32F-NEXT: andi t0, a6, 16
+; RV64ZVE32F-NEXT: beqz t0, .LBB48_10
+; RV64ZVE32F-NEXT: .LBB48_13: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s t0, v8
+; RV64ZVE32F-NEXT: slli t0, t0, 3
+; RV64ZVE32F-NEXT: add t0, a1, t0
+; RV64ZVE32F-NEXT: ld t0, 0(t0)
+; RV64ZVE32F-NEXT: andi t1, a6, 32
+; RV64ZVE32F-NEXT: beqz t1, .LBB48_11
+; RV64ZVE32F-NEXT: .LBB48_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s t1, v9
+; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
-; RV64ZVE32F-NEXT: .LBB48_14: # %else14
-; RV64ZVE32F-NEXT: andi t2, a5, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz t2, .LBB48_17
-; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16
+; RV64ZVE32F-NEXT: .LBB48_15: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: andi t2, a6, 64
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz t2, .LBB48_18
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s t2, v8
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
-; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: bnez a5, .LBB48_18
-; RV64ZVE32F-NEXT: .LBB48_16:
-; RV64ZVE32F-NEXT: ld a1, 56(a2)
-; RV64ZVE32F-NEXT: j .LBB48_19
+; RV64ZVE32F-NEXT: andi a6, a6, -128
+; RV64ZVE32F-NEXT: bnez a6, .LBB48_19
; RV64ZVE32F-NEXT: .LBB48_17:
+; RV64ZVE32F-NEXT: ld a1, 56(a2)
+; RV64ZVE32F-NEXT: j .LBB48_20
+; RV64ZVE32F-NEXT: .LBB48_18:
; RV64ZVE32F-NEXT: ld t2, 48(a2)
-; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: beqz a5, .LBB48_16
-; RV64ZVE32F-NEXT: .LBB48_18: # %cond.load19
+; RV64ZVE32F-NEXT: andi a6, a6, -128
+; RV64ZVE32F-NEXT: beqz a6, .LBB48_17
+; RV64ZVE32F-NEXT: .LBB48_19: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: ld a1, 0(a1)
-; RV64ZVE32F-NEXT: .LBB48_19: # %else20
+; RV64ZVE32F-NEXT: .LBB48_20: # %else20
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: sd a4, 8(a0)
-; RV64ZVE32F-NEXT: sd a6, 16(a0)
+; RV64ZVE32F-NEXT: sd a5, 16(a0)
; RV64ZVE32F-NEXT: sd a7, 24(a0)
; RV64ZVE32F-NEXT: sd t0, 32(a0)
; RV64ZVE32F-NEXT: sd t1, 40(a0)
@@ -4657,22 +4639,22 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-LABEL: mgather_baseidx_sext_v8i8_v8i64:
; RV64ZVE32F: # %bb.0:
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a5, v0
-; RV64ZVE32F-NEXT: andi a3, a5, 1
+; RV64ZVE32F-NEXT: vmv.x.s a6, v0
+; RV64ZVE32F-NEXT: andi a3, a6, 1
; RV64ZVE32F-NEXT: beqz a3, .LBB49_3
; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: ld a3, 0(a3)
-; RV64ZVE32F-NEXT: andi a4, a5, 2
+; RV64ZVE32F-NEXT: andi a4, a6, 2
; RV64ZVE32F-NEXT: bnez a4, .LBB49_4
; RV64ZVE32F-NEXT: .LBB49_2:
; RV64ZVE32F-NEXT: ld a4, 8(a2)
; RV64ZVE32F-NEXT: j .LBB49_5
; RV64ZVE32F-NEXT: .LBB49_3:
; RV64ZVE32F-NEXT: ld a3, 0(a2)
-; RV64ZVE32F-NEXT: andi a4, a5, 2
+; RV64ZVE32F-NEXT: andi a4, a6, 2
; RV64ZVE32F-NEXT: beqz a4, .LBB49_2
; RV64ZVE32F-NEXT: .LBB49_4: # %cond.load1
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
@@ -4682,83 +4664,87 @@ define <8 x i64> @mgather_baseidx_sext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
; RV64ZVE32F-NEXT: .LBB49_5: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: andi a6, a5, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: beqz a6, .LBB49_10
+; RV64ZVE32F-NEXT: andi a5, a6, 4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a5, .LBB49_7
; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a6, v8
-; RV64ZVE32F-NEXT: slli a6, a6, 3
-; RV64ZVE32F-NEXT: add a6, a1, a6
-; RV64ZVE32F-NEXT: ld a6, 0(a6)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: bnez a7, .LBB49_11
+; RV64ZVE32F-NEXT: vmv.x.s a5, v9
+; RV64ZVE32F-NEXT: slli a5, a5, 3
+; RV64ZVE32F-NEXT: add a5, a1, a5
+; RV64ZVE32F-NEXT: ld a5, 0(a5)
+; RV64ZVE32F-NEXT: j .LBB49_8
; RV64ZVE32F-NEXT: .LBB49_7:
-; RV64ZVE32F-NEXT: ld a7, 24(a2)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: bnez t0, .LBB49_12
-; RV64ZVE32F-NEXT: .LBB49_8:
-; RV64ZVE32F-NEXT: ld t0, 32(a2)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: bnez t1, .LBB49_13
-; RV64ZVE32F-NEXT: .LBB49_9:
-; RV64ZVE32F-NEXT: ld t1, 40(a2)
-; RV64ZVE32F-NEXT: j .LBB49_14
-; RV64ZVE32F-NEXT: .LBB49_10:
-; RV64ZVE32F-NEXT: ld a6, 16(a2)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: beqz a7, .LBB49_7
-; RV64ZVE32F-NEXT: .LBB49_11: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a7, v8
+; RV64ZVE32F-NEXT: ld a5, 16(a2)
+; RV64ZVE32F-NEXT: .LBB49_8: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: andi a7, a6, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: beqz a7, .LBB49_12
+; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a7, v9
; RV64ZVE32F-NEXT: slli a7, a7, 3
; RV64ZVE32F-NEXT: add a7, a1, a7
; RV64ZVE32F-NEXT: ld a7, 0(a7)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: beqz t0, .LBB49_8
-; RV64ZVE32F-NEXT: .LBB49_12: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s t0, v9
+; RV64ZVE32F-NEXT: andi t0, a6, 16
+; RV64ZVE32F-NEXT: bnez t0, .LBB49_13
+; RV64ZVE32F-NEXT: .LBB49_10:
+; RV64ZVE32F-NEXT: ld t0, 32(a2)
+; RV64ZVE32F-NEXT: andi t1, a6, 32
+; RV64ZVE32F-NEXT: bnez t1, .LBB49_14
+; RV64ZVE32F-NEXT: .LBB49_11:
+; RV64ZVE32F-NEXT: ld t1, 40(a2)
+; RV64ZVE32F-NEXT: j .LBB49_15
+; RV64ZVE32F-NEXT: .LBB49_12:
+; RV64ZVE32F-NEXT: ld a7, 24(a2)
+; RV64ZVE32F-NEXT: andi t0, a6, 16
+; RV64ZVE32F-NEXT: beqz t0, .LBB49_10
+; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s t0, v8
; RV64ZVE32F-NEXT: slli t0, t0, 3
; RV64ZVE32F-NEXT: add t0, a1, t0
; RV64ZVE32F-NEXT: ld t0, 0(t0)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: beqz t1, .LBB49_9
-; RV64ZVE32F-NEXT: .LBB49_13: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s t1, v8
+; RV64ZVE32F-NEXT: andi t1, a6, 32
+; RV64ZVE32F-NEXT: beqz t1, .LBB49_11
+; RV64ZVE32F-NEXT: .LBB49_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s t1, v9
; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
-; RV64ZVE32F-NEXT: .LBB49_14: # %else14
-; RV64ZVE32F-NEXT: andi t2, a5, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz t2, .LBB49_17
-; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16
+; RV64ZVE32F-NEXT: .LBB49_15: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: andi t2, a6, 64
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz t2, .LBB49_18
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s t2, v8
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
-; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: bnez a5, .LBB49_18
-; RV64ZVE32F-NEXT: .LBB49_16:
-; RV64ZVE32F-NEXT: ld a1, 56(a2)
-; RV64ZVE32F-NEXT: j .LBB49_19
+; RV64ZVE32F-NEXT: andi a6, a6, -128
+; RV64ZVE32F-NEXT: bnez a6, .LBB49_19
; RV64ZVE32F-NEXT: .LBB49_17:
+; RV64ZVE32F-NEXT: ld a1, 56(a2)
+; RV64ZVE32F-NEXT: j .LBB49_20
+; RV64ZVE32F-NEXT: .LBB49_18:
; RV64ZVE32F-NEXT: ld t2, 48(a2)
-; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: beqz a5, .LBB49_16
-; RV64ZVE32F-NEXT: .LBB49_18: # %cond.load19
+; RV64ZVE32F-NEXT: andi a6, a6, -128
+; RV64ZVE32F-NEXT: beqz a6, .LBB49_17
+; RV64ZVE32F-NEXT: .LBB49_19: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: ld a1, 0(a1)
-; RV64ZVE32F-NEXT: .LBB49_19: # %else20
+; RV64ZVE32F-NEXT: .LBB49_20: # %else20
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: sd a4, 8(a0)
-; RV64ZVE32F-NEXT: sd a6, 16(a0)
+; RV64ZVE32F-NEXT: sd a5, 16(a0)
; RV64ZVE32F-NEXT: sd a7, 24(a0)
; RV64ZVE32F-NEXT: sd t0, 32(a0)
; RV64ZVE32F-NEXT: sd t1, 40(a0)
@@ -4959,86 +4945,90 @@ define <8 x i64> @mgather_baseidx_zext_v8i8_v8i64(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
; RV64ZVE32F-NEXT: .LBB50_5: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a6, a5, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: beqz a6, .LBB50_10
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a6, .LBB50_7
; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a6, v8
+; RV64ZVE32F-NEXT: vmv.x.s a6, v9
; RV64ZVE32F-NEXT: andi a6, a6, 255
; RV64ZVE32F-NEXT: slli a6, a6, 3
; RV64ZVE32F-NEXT: add a6, a1, a6
; RV64ZVE32F-NEXT: ld a6, 0(a6)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: bnez a7, .LBB50_11
+; RV64ZVE32F-NEXT: j .LBB50_8
; RV64ZVE32F-NEXT: .LBB50_7:
-; RV64ZVE32F-NEXT: ld a7, 24(a2)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: bnez t0, .LBB50_12
-; RV64ZVE32F-NEXT: .LBB50_8:
-; RV64ZVE32F-NEXT: ld t0, 32(a2)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: bnez t1, .LBB50_13
-; RV64ZVE32F-NEXT: .LBB50_9:
-; RV64ZVE32F-NEXT: ld t1, 40(a2)
-; RV64ZVE32F-NEXT: j .LBB50_14
-; RV64ZVE32F-NEXT: .LBB50_10:
; RV64ZVE32F-NEXT: ld a6, 16(a2)
+; RV64ZVE32F-NEXT: .LBB50_8: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: beqz a7, .LBB50_7
-; RV64ZVE32F-NEXT: .LBB50_11: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a7, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: beqz a7, .LBB50_12
+; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a7, v9
; RV64ZVE32F-NEXT: andi a7, a7, 255
; RV64ZVE32F-NEXT: slli a7, a7, 3
; RV64ZVE32F-NEXT: add a7, a1, a7
; RV64ZVE32F-NEXT: ld a7, 0(a7)
; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: beqz t0, .LBB50_8
-; RV64ZVE32F-NEXT: .LBB50_12: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s t0, v9
+; RV64ZVE32F-NEXT: bnez t0, .LBB50_13
+; RV64ZVE32F-NEXT: .LBB50_10:
+; RV64ZVE32F-NEXT: ld t0, 32(a2)
+; RV64ZVE32F-NEXT: andi t1, a5, 32
+; RV64ZVE32F-NEXT: bnez t1, .LBB50_14
+; RV64ZVE32F-NEXT: .LBB50_11:
+; RV64ZVE32F-NEXT: ld t1, 40(a2)
+; RV64ZVE32F-NEXT: j .LBB50_15
+; RV64ZVE32F-NEXT: .LBB50_12:
+; RV64ZVE32F-NEXT: ld a7, 24(a2)
+; RV64ZVE32F-NEXT: andi t0, a5, 16
+; RV64ZVE32F-NEXT: beqz t0, .LBB50_10
+; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s t0, v8
; RV64ZVE32F-NEXT: andi t0, t0, 255
; RV64ZVE32F-NEXT: slli t0, t0, 3
; RV64ZVE32F-NEXT: add t0, a1, t0
; RV64ZVE32F-NEXT: ld t0, 0(t0)
; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: beqz t1, .LBB50_9
-; RV64ZVE32F-NEXT: .LBB50_13: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s t1, v8
+; RV64ZVE32F-NEXT: beqz t1, .LBB50_11
+; RV64ZVE32F-NEXT: .LBB50_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s t1, v9
; RV64ZVE32F-NEXT: andi t1, t1, 255
; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
-; RV64ZVE32F-NEXT: .LBB50_14: # %else14
+; RV64ZVE32F-NEXT: .LBB50_15: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi t2, a5, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz t2, .LBB50_17
-; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz t2, .LBB50_18
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s t2, v8
; RV64ZVE32F-NEXT: andi t2, t2, 255
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: bnez a5, .LBB50_18
-; RV64ZVE32F-NEXT: .LBB50_16:
-; RV64ZVE32F-NEXT: ld a1, 56(a2)
-; RV64ZVE32F-NEXT: j .LBB50_19
+; RV64ZVE32F-NEXT: bnez a5, .LBB50_19
; RV64ZVE32F-NEXT: .LBB50_17:
+; RV64ZVE32F-NEXT: ld a1, 56(a2)
+; RV64ZVE32F-NEXT: j .LBB50_20
+; RV64ZVE32F-NEXT: .LBB50_18:
; RV64ZVE32F-NEXT: ld t2, 48(a2)
; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: beqz a5, .LBB50_16
-; RV64ZVE32F-NEXT: .LBB50_18: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a5, .LBB50_17
+; RV64ZVE32F-NEXT: .LBB50_19: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: ld a1, 0(a1)
-; RV64ZVE32F-NEXT: .LBB50_19: # %else20
+; RV64ZVE32F-NEXT: .LBB50_20: # %else20
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: sd a4, 8(a0)
; RV64ZVE32F-NEXT: sd a6, 16(a0)
@@ -5241,80 +5231,84 @@ define <8 x i64> @mgather_baseidx_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <8 x i
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
; RV64ZVE32F-NEXT: .LBB51_5: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a6, a5, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: beqz a6, .LBB51_10
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a6, .LBB51_7
; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a6, v8
+; RV64ZVE32F-NEXT: vmv.x.s a6, v9
; RV64ZVE32F-NEXT: slli a6, a6, 3
; RV64ZVE32F-NEXT: add a6, a1, a6
; RV64ZVE32F-NEXT: ld a6, 0(a6)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: bnez a7, .LBB51_11
+; RV64ZVE32F-NEXT: j .LBB51_8
; RV64ZVE32F-NEXT: .LBB51_7:
-; RV64ZVE32F-NEXT: ld a7, 24(a2)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: bnez t0, .LBB51_12
-; RV64ZVE32F-NEXT: .LBB51_8:
-; RV64ZVE32F-NEXT: ld t0, 32(a2)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: bnez t1, .LBB51_13
-; RV64ZVE32F-NEXT: .LBB51_9:
-; RV64ZVE32F-NEXT: ld t1, 40(a2)
-; RV64ZVE32F-NEXT: j .LBB51_14
-; RV64ZVE32F-NEXT: .LBB51_10:
; RV64ZVE32F-NEXT: ld a6, 16(a2)
+; RV64ZVE32F-NEXT: .LBB51_8: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: beqz a7, .LBB51_7
-; RV64ZVE32F-NEXT: .LBB51_11: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a7, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: beqz a7, .LBB51_12
+; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a7, v9
; RV64ZVE32F-NEXT: slli a7, a7, 3
; RV64ZVE32F-NEXT: add a7, a1, a7
; RV64ZVE32F-NEXT: ld a7, 0(a7)
; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: beqz t0, .LBB51_8
-; RV64ZVE32F-NEXT: .LBB51_12: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s t0, v9
+; RV64ZVE32F-NEXT: bnez t0, .LBB51_13
+; RV64ZVE32F-NEXT: .LBB51_10:
+; RV64ZVE32F-NEXT: ld t0, 32(a2)
+; RV64ZVE32F-NEXT: andi t1, a5, 32
+; RV64ZVE32F-NEXT: bnez t1, .LBB51_14
+; RV64ZVE32F-NEXT: .LBB51_11:
+; RV64ZVE32F-NEXT: ld t1, 40(a2)
+; RV64ZVE32F-NEXT: j .LBB51_15
+; RV64ZVE32F-NEXT: .LBB51_12:
+; RV64ZVE32F-NEXT: ld a7, 24(a2)
+; RV64ZVE32F-NEXT: andi t0, a5, 16
+; RV64ZVE32F-NEXT: beqz t0, .LBB51_10
+; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s t0, v8
; RV64ZVE32F-NEXT: slli t0, t0, 3
; RV64ZVE32F-NEXT: add t0, a1, t0
; RV64ZVE32F-NEXT: ld t0, 0(t0)
; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: beqz t1, .LBB51_9
-; RV64ZVE32F-NEXT: .LBB51_13: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s t1, v8
+; RV64ZVE32F-NEXT: beqz t1, .LBB51_11
+; RV64ZVE32F-NEXT: .LBB51_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s t1, v9
; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
-; RV64ZVE32F-NEXT: .LBB51_14: # %else14
+; RV64ZVE32F-NEXT: .LBB51_15: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi t2, a5, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz t2, .LBB51_17
-; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz t2, .LBB51_18
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s t2, v8
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: bnez a5, .LBB51_18
-; RV64ZVE32F-NEXT: .LBB51_16:
-; RV64ZVE32F-NEXT: ld a1, 56(a2)
-; RV64ZVE32F-NEXT: j .LBB51_19
+; RV64ZVE32F-NEXT: bnez a5, .LBB51_19
; RV64ZVE32F-NEXT: .LBB51_17:
+; RV64ZVE32F-NEXT: ld a1, 56(a2)
+; RV64ZVE32F-NEXT: j .LBB51_20
+; RV64ZVE32F-NEXT: .LBB51_18:
; RV64ZVE32F-NEXT: ld t2, 48(a2)
; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: beqz a5, .LBB51_16
-; RV64ZVE32F-NEXT: .LBB51_18: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a5, .LBB51_17
+; RV64ZVE32F-NEXT: .LBB51_19: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: ld a1, 0(a1)
-; RV64ZVE32F-NEXT: .LBB51_19: # %else20
+; RV64ZVE32F-NEXT: .LBB51_20: # %else20
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: sd a4, 8(a0)
; RV64ZVE32F-NEXT: sd a6, 16(a0)
@@ -5516,80 +5510,84 @@ define <8 x i64> @mgather_baseidx_sext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
; RV64ZVE32F-NEXT: .LBB52_5: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a6, a5, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: beqz a6, .LBB52_10
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a6, .LBB52_7
; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a6, v8
+; RV64ZVE32F-NEXT: vmv.x.s a6, v9
; RV64ZVE32F-NEXT: slli a6, a6, 3
; RV64ZVE32F-NEXT: add a6, a1, a6
; RV64ZVE32F-NEXT: ld a6, 0(a6)
-; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: bnez a7, .LBB52_11
+; RV64ZVE32F-NEXT: j .LBB52_8
; RV64ZVE32F-NEXT: .LBB52_7:
-; RV64ZVE32F-NEXT: ld a7, 24(a2)
-; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: bnez t0, .LBB52_12
-; RV64ZVE32F-NEXT: .LBB52_8:
-; RV64ZVE32F-NEXT: ld t0, 32(a2)
-; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: bnez t1, .LBB52_13
-; RV64ZVE32F-NEXT: .LBB52_9:
-; RV64ZVE32F-NEXT: ld t1, 40(a2)
-; RV64ZVE32F-NEXT: j .LBB52_14
-; RV64ZVE32F-NEXT: .LBB52_10:
; RV64ZVE32F-NEXT: ld a6, 16(a2)
+; RV64ZVE32F-NEXT: .LBB52_8: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a7, a5, 8
-; RV64ZVE32F-NEXT: beqz a7, .LBB52_7
-; RV64ZVE32F-NEXT: .LBB52_11: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a7, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: beqz a7, .LBB52_12
+; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a7, v9
; RV64ZVE32F-NEXT: slli a7, a7, 3
; RV64ZVE32F-NEXT: add a7, a1, a7
; RV64ZVE32F-NEXT: ld a7, 0(a7)
; RV64ZVE32F-NEXT: andi t0, a5, 16
-; RV64ZVE32F-NEXT: beqz t0, .LBB52_8
-; RV64ZVE32F-NEXT: .LBB52_12: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s t0, v9
+; RV64ZVE32F-NEXT: bnez t0, .LBB52_13
+; RV64ZVE32F-NEXT: .LBB52_10:
+; RV64ZVE32F-NEXT: ld t0, 32(a2)
+; RV64ZVE32F-NEXT: andi t1, a5, 32
+; RV64ZVE32F-NEXT: bnez t1, .LBB52_14
+; RV64ZVE32F-NEXT: .LBB52_11:
+; RV64ZVE32F-NEXT: ld t1, 40(a2)
+; RV64ZVE32F-NEXT: j .LBB52_15
+; RV64ZVE32F-NEXT: .LBB52_12:
+; RV64ZVE32F-NEXT: ld a7, 24(a2)
+; RV64ZVE32F-NEXT: andi t0, a5, 16
+; RV64ZVE32F-NEXT: beqz t0, .LBB52_10
+; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s t0, v8
; RV64ZVE32F-NEXT: slli t0, t0, 3
; RV64ZVE32F-NEXT: add t0, a1, t0
; RV64ZVE32F-NEXT: ld t0, 0(t0)
; RV64ZVE32F-NEXT: andi t1, a5, 32
-; RV64ZVE32F-NEXT: beqz t1, .LBB52_9
-; RV64ZVE32F-NEXT: .LBB52_13: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s t1, v8
+; RV64ZVE32F-NEXT: beqz t1, .LBB52_11
+; RV64ZVE32F-NEXT: .LBB52_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s t1, v9
; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
-; RV64ZVE32F-NEXT: .LBB52_14: # %else14
+; RV64ZVE32F-NEXT: .LBB52_15: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi t2, a5, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz t2, .LBB52_17
-; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz t2, .LBB52_18
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s t2, v8
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: bnez a5, .LBB52_18
-; RV64ZVE32F-NEXT: .LBB52_16:
-; RV64ZVE32F-NEXT: ld a1, 56(a2)
-; RV64ZVE32F-NEXT: j .LBB52_19
+; RV64ZVE32F-NEXT: bnez a5, .LBB52_19
; RV64ZVE32F-NEXT: .LBB52_17:
+; RV64ZVE32F-NEXT: ld a1, 56(a2)
+; RV64ZVE32F-NEXT: j .LBB52_20
+; RV64ZVE32F-NEXT: .LBB52_18:
; RV64ZVE32F-NEXT: ld t2, 48(a2)
; RV64ZVE32F-NEXT: andi a5, a5, -128
-; RV64ZVE32F-NEXT: beqz a5, .LBB52_16
-; RV64ZVE32F-NEXT: .LBB52_18: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a5, .LBB52_17
+; RV64ZVE32F-NEXT: .LBB52_19: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: ld a1, 0(a1)
-; RV64ZVE32F-NEXT: .LBB52_19: # %else20
+; RV64ZVE32F-NEXT: .LBB52_20: # %else20
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: sd a4, 8(a0)
; RV64ZVE32F-NEXT: sd a6, 16(a0)
@@ -5796,86 +5794,90 @@ define <8 x i64> @mgather_baseidx_zext_v8i16_v8i64(ptr %base, <8 x i16> %idxs, <
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: ld a4, 0(a4)
; RV64ZVE32F-NEXT: .LBB53_5: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a7, a6, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: beqz a7, .LBB53_10
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a7, .LBB53_7
; RV64ZVE32F-NEXT: # %bb.6: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a7, v8
+; RV64ZVE32F-NEXT: vmv.x.s a7, v9
; RV64ZVE32F-NEXT: and a7, a7, a5
; RV64ZVE32F-NEXT: slli a7, a7, 3
; RV64ZVE32F-NEXT: add a7, a1, a7
; RV64ZVE32F-NEXT: ld a7, 0(a7)
-; RV64ZVE32F-NEXT: andi t0, a6, 8
-; RV64ZVE32F-NEXT: bnez t0, .LBB53_11
+; RV64ZVE32F-NEXT: j .LBB53_8
; RV64ZVE32F-NEXT: .LBB53_7:
-; RV64ZVE32F-NEXT: ld t0, 24(a2)
-; RV64ZVE32F-NEXT: andi t1, a6, 16
-; RV64ZVE32F-NEXT: bnez t1, .LBB53_12
-; RV64ZVE32F-NEXT: .LBB53_8:
-; RV64ZVE32F-NEXT: ld t1, 32(a2)
-; RV64ZVE32F-NEXT: andi t2, a6, 32
-; RV64ZVE32F-NEXT: bnez t2, .LBB53_13
-; RV64ZVE32F-NEXT: .LBB53_9:
-; RV64ZVE32F-NEXT: ld t2, 40(a2)
-; RV64ZVE32F-NEXT: j .LBB53_14
-; RV64ZVE32F-NEXT: .LBB53_10:
; RV64ZVE32F-NEXT: ld a7, 16(a2)
+; RV64ZVE32F-NEXT: .LBB53_8: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi t0, a6, 8
-; RV64ZVE32F-NEXT: beqz t0, .LBB53_7
-; RV64ZVE32F-NEXT: .LBB53_11: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s t0, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
+; RV64ZVE32F-NEXT: beqz t0, .LBB53_12
+; RV64ZVE32F-NEXT: # %bb.9: # %cond.load7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s t0, v9
; RV64ZVE32F-NEXT: and t0, t0, a5
; RV64ZVE32F-NEXT: slli t0, t0, 3
; RV64ZVE32F-NEXT: add t0, a1, t0
; RV64ZVE32F-NEXT: ld t0, 0(t0)
; RV64ZVE32F-NEXT: andi t1, a6, 16
-; RV64ZVE32F-NEXT: beqz t1, .LBB53_8
-; RV64ZVE32F-NEXT: .LBB53_12: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s t1, v9
+; RV64ZVE32F-NEXT: bnez t1, .LBB53_13
+; RV64ZVE32F-NEXT: .LBB53_10:
+; RV64ZVE32F-NEXT: ld t1, 32(a2)
+; RV64ZVE32F-NEXT: andi t2, a6, 32
+; RV64ZVE32F-NEXT: bnez t2, .LBB53_14
+; RV64ZVE32F-NEXT: .LBB53_11:
+; RV64ZVE32F-NEXT: ld t2, 40(a2)
+; RV64ZVE32F-NEXT: j .LBB53_15
+; RV64ZVE32F-NEXT: .LBB53_12:
+; RV64ZVE32F-NEXT: ld t0, 24(a2)
+; RV64ZVE32F-NEXT: andi t1, a6, 16
+; RV64ZVE32F-NEXT: beqz t1, .LBB53_10
+; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s t1, v8
; RV64ZVE32F-NEXT: and t1, t1, a5
; RV64ZVE32F-NEXT: slli t1, t1, 3
; RV64ZVE32F-NEXT: add t1, a1, t1
; RV64ZVE32F-NEXT: ld t1, 0(t1)
; RV64ZVE32F-NEXT: andi t2, a6, 32
-; RV64ZVE32F-NEXT: beqz t2, .LBB53_9
-; RV64ZVE32F-NEXT: .LBB53_13: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s t2, v8
+; RV64ZVE32F-NEXT: beqz t2, .LBB53_11
+; RV64ZVE32F-NEXT: .LBB53_14: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s t2, v9
; RV64ZVE32F-NEXT: and t2, t2, a5
; RV64ZVE32F-NEXT: slli t2, t2, 3
; RV64ZVE32F-NEXT: add t2, a1, t2
; RV64ZVE32F-NEXT: ld t2, 0(t2)
-; RV64ZVE32F-NEXT: .LBB53_14: # %else14
+; RV64ZVE32F-NEXT: .LBB53_15: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi t3, a6, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz t3, .LBB53_17
-; RV64ZVE32F-NEXT: # %bb.15: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz t3, .LBB53_18
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s t3, v8
; RV64ZVE32F-NEXT: and t3, t3, a5
; RV64ZVE32F-NEXT: slli t3, t3, 3
; RV64ZVE32F-NEXT: add t3, a1, t3
; RV64ZVE32F-NEXT: ld t3, 0(t3)
; RV64ZVE32F-NEXT: andi a6, a6, -128
-; RV64ZVE32F-NEXT: bnez a6, .LBB53_18
-; RV64ZVE32F-NEXT: .LBB53_16:
-; RV64ZVE32F-NEXT: ld a1, 56(a2)
-; RV64ZVE32F-NEXT: j .LBB53_19
+; RV64ZVE32F-NEXT: bnez a6, .LBB53_19
; RV64ZVE32F-NEXT: .LBB53_17:
+; RV64ZVE32F-NEXT: ld a1, 56(a2)
+; RV64ZVE32F-NEXT: j .LBB53_20
+; RV64ZVE32F-NEXT: .LBB53_18:
; RV64ZVE32F-NEXT: ld t3, 48(a2)
; RV64ZVE32F-NEXT: andi a6, a6, -128
-; RV64ZVE32F-NEXT: beqz a6, .LBB53_16
-; RV64ZVE32F-NEXT: .LBB53_18: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a6, .LBB53_17
+; RV64ZVE32F-NEXT: .LBB53_19: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: and a2, a2, a5
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: ld a1, 0(a1)
-; RV64ZVE32F-NEXT: .LBB53_19: # %else20
+; RV64ZVE32F-NEXT: .LBB53_20: # %else20
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: sd a4, 8(a0)
; RV64ZVE32F-NEXT: sd a7, 16(a0)
@@ -7401,76 +7403,74 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB64_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB64_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB64_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flh fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB64_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB64_13
-; RV64ZVE32F-NEXT: .LBB64_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB64_14
-; RV64ZVE32F-NEXT: .LBB64_7: # %else11
+; RV64ZVE32F-NEXT: .LBB64_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB64_9
-; RV64ZVE32F-NEXT: .LBB64_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB64_10
+; RV64ZVE32F-NEXT: .LBB64_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB64_9: # %else14
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB64_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB64_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB64_16
-; RV64ZVE32F-NEXT: .LBB64_11: # %else20
+; RV64ZVE32F-NEXT: .LBB64_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB64_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB64_6
; RV64ZVE32F-NEXT: .LBB64_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB64_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB64_8
; RV64ZVE32F-NEXT: .LBB64_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB64_8
-; RV64ZVE32F-NEXT: j .LBB64_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB64_9
+; RV64ZVE32F-NEXT: j .LBB64_10
; RV64ZVE32F-NEXT: .LBB64_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -7480,7 +7480,7 @@ define <8 x half> @mgather_baseidx_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8 x i1
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB64_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB64_12
; RV64ZVE32F-NEXT: .LBB64_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -7546,76 +7546,74 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB65_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB65_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB65_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flh fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB65_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB65_13
-; RV64ZVE32F-NEXT: .LBB65_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB65_14
-; RV64ZVE32F-NEXT: .LBB65_7: # %else11
+; RV64ZVE32F-NEXT: .LBB65_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB65_9
-; RV64ZVE32F-NEXT: .LBB65_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB65_10
+; RV64ZVE32F-NEXT: .LBB65_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB65_9: # %else14
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB65_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB65_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB65_16
-; RV64ZVE32F-NEXT: .LBB65_11: # %else20
+; RV64ZVE32F-NEXT: .LBB65_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB65_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB65_6
; RV64ZVE32F-NEXT: .LBB65_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB65_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB65_8
; RV64ZVE32F-NEXT: .LBB65_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB65_8
-; RV64ZVE32F-NEXT: j .LBB65_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB65_9
+; RV64ZVE32F-NEXT: j .LBB65_10
; RV64ZVE32F-NEXT: .LBB65_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -7625,7 +7623,7 @@ define <8 x half> @mgather_baseidx_sext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB65_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB65_12
; RV64ZVE32F-NEXT: .LBB65_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -7694,80 +7692,78 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB66_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB66_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB66_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flh fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB66_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB66_13
-; RV64ZVE32F-NEXT: .LBB66_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB66_14
-; RV64ZVE32F-NEXT: .LBB66_7: # %else11
+; RV64ZVE32F-NEXT: .LBB66_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB66_9
-; RV64ZVE32F-NEXT: .LBB66_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB66_10
+; RV64ZVE32F-NEXT: .LBB66_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB66_9: # %else14
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB66_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB66_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB66_16
-; RV64ZVE32F-NEXT: .LBB66_11: # %else20
+; RV64ZVE32F-NEXT: .LBB66_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB66_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB66_6
; RV64ZVE32F-NEXT: .LBB66_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB66_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB66_8
; RV64ZVE32F-NEXT: .LBB66_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB66_8
-; RV64ZVE32F-NEXT: j .LBB66_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB66_9
+; RV64ZVE32F-NEXT: j .LBB66_10
; RV64ZVE32F-NEXT: .LBB66_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -7778,7 +7774,7 @@ define <8 x half> @mgather_baseidx_zext_v8i8_v8f16(ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB66_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB66_12
; RV64ZVE32F-NEXT: .LBB66_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -7845,75 +7841,73 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB67_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB67_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB67_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flh fa5, 0(a2)
+; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
+; RV64ZVE32F-NEXT: .LBB67_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB67_13
-; RV64ZVE32F-NEXT: .LBB67_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB67_14
-; RV64ZVE32F-NEXT: .LBB67_7: # %else11
+; RV64ZVE32F-NEXT: .LBB67_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB67_9
-; RV64ZVE32F-NEXT: .LBB67_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB67_10
+; RV64ZVE32F-NEXT: .LBB67_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 5
-; RV64ZVE32F-NEXT: .LBB67_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 5
+; RV64ZVE32F-NEXT: .LBB67_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB67_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB67_16
-; RV64ZVE32F-NEXT: .LBB67_11: # %else20
+; RV64ZVE32F-NEXT: .LBB67_12: # %else20
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB67_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vfmv.s.f v11, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB67_6
; RV64ZVE32F-NEXT: .LBB67_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 3
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB67_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB67_8
; RV64ZVE32F-NEXT: .LBB67_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 5, e16, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flh fa5, 0(a2)
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v10, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB67_8
-; RV64ZVE32F-NEXT: j .LBB67_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB67_9
+; RV64ZVE32F-NEXT: j .LBB67_10
; RV64ZVE32F-NEXT: .LBB67_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -7923,7 +7917,7 @@ define <8 x half> @mgather_baseidx_v8f16(ptr %base, <8 x i16> %idxs, <8 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 7, e16, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB67_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB67_12
; RV64ZVE32F-NEXT: .LBB67_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -8343,58 +8337,56 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB74_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB74_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB74_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flw fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB74_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB74_13
-; RV64ZVE32F-NEXT: .LBB74_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB74_14
-; RV64ZVE32F-NEXT: .LBB74_7: # %else11
+; RV64ZVE32F-NEXT: .LBB74_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB74_9
-; RV64ZVE32F-NEXT: .LBB74_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB74_10
+; RV64ZVE32F-NEXT: .LBB74_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB74_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB74_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB74_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB74_16
-; RV64ZVE32F-NEXT: .LBB74_11: # %else20
+; RV64ZVE32F-NEXT: .LBB74_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB74_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB74_6
; RV64ZVE32F-NEXT: .LBB74_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
@@ -8403,19 +8395,19 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB74_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB74_8
; RV64ZVE32F-NEXT: .LBB74_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB74_8
-; RV64ZVE32F-NEXT: j .LBB74_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB74_9
+; RV64ZVE32F-NEXT: j .LBB74_10
; RV64ZVE32F-NEXT: .LBB74_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -8426,7 +8418,7 @@ define <8 x float> @mgather_baseidx_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <8 x i
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB74_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB74_12
; RV64ZVE32F-NEXT: .LBB74_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -8493,58 +8485,56 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB75_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB75_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB75_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flw fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB75_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB75_13
-; RV64ZVE32F-NEXT: .LBB75_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB75_14
-; RV64ZVE32F-NEXT: .LBB75_7: # %else11
+; RV64ZVE32F-NEXT: .LBB75_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB75_9
-; RV64ZVE32F-NEXT: .LBB75_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB75_10
+; RV64ZVE32F-NEXT: .LBB75_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB75_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB75_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB75_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB75_16
-; RV64ZVE32F-NEXT: .LBB75_11: # %else20
+; RV64ZVE32F-NEXT: .LBB75_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB75_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB75_6
; RV64ZVE32F-NEXT: .LBB75_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
@@ -8553,19 +8543,19 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB75_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB75_8
; RV64ZVE32F-NEXT: .LBB75_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB75_8
-; RV64ZVE32F-NEXT: j .LBB75_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB75_9
+; RV64ZVE32F-NEXT: j .LBB75_10
; RV64ZVE32F-NEXT: .LBB75_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -8576,7 +8566,7 @@ define <8 x float> @mgather_baseidx_sext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB75_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB75_12
; RV64ZVE32F-NEXT: .LBB75_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -8646,83 +8636,81 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB76_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB76_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB76_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flw fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB76_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB76_13
-; RV64ZVE32F-NEXT: .LBB76_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB76_14
-; RV64ZVE32F-NEXT: .LBB76_7: # %else11
+; RV64ZVE32F-NEXT: .LBB76_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB76_9
-; RV64ZVE32F-NEXT: .LBB76_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB76_10
+; RV64ZVE32F-NEXT: .LBB76_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB76_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB76_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB76_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB76_16
-; RV64ZVE32F-NEXT: .LBB76_11: # %else20
+; RV64ZVE32F-NEXT: .LBB76_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB76_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB76_6
; RV64ZVE32F-NEXT: .LBB76_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB76_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB76_8
; RV64ZVE32F-NEXT: .LBB76_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB76_8
-; RV64ZVE32F-NEXT: j .LBB76_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB76_9
+; RV64ZVE32F-NEXT: j .LBB76_10
; RV64ZVE32F-NEXT: .LBB76_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -8734,7 +8722,7 @@ define <8 x float> @mgather_baseidx_zext_v8i8_v8f32(ptr %base, <8 x i8> %idxs, <
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB76_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB76_12
; RV64ZVE32F-NEXT: .LBB76_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -8804,58 +8792,56 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB77_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB77_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB77_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flw fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB77_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB77_13
-; RV64ZVE32F-NEXT: .LBB77_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB77_14
-; RV64ZVE32F-NEXT: .LBB77_7: # %else11
+; RV64ZVE32F-NEXT: .LBB77_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB77_9
-; RV64ZVE32F-NEXT: .LBB77_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB77_10
+; RV64ZVE32F-NEXT: .LBB77_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB77_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB77_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB77_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB77_16
-; RV64ZVE32F-NEXT: .LBB77_11: # %else20
+; RV64ZVE32F-NEXT: .LBB77_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB77_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB77_6
; RV64ZVE32F-NEXT: .LBB77_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
@@ -8864,19 +8850,19 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB77_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB77_8
; RV64ZVE32F-NEXT: .LBB77_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB77_8
-; RV64ZVE32F-NEXT: j .LBB77_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB77_9
+; RV64ZVE32F-NEXT: j .LBB77_10
; RV64ZVE32F-NEXT: .LBB77_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -8887,7 +8873,7 @@ define <8 x float> @mgather_baseidx_v8i16_v8f32(ptr %base, <8 x i16> %idxs, <8 x
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB77_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB77_12
; RV64ZVE32F-NEXT: .LBB77_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -8952,61 +8938,59 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
-; RV64ZVE32F-NEXT: .LBB78_4: # %else2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
+; RV64ZVE32F-NEXT: .LBB78_4: # %else2
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: andi a2, a1, 4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB78_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: flw fa5, 0(a2)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB78_6: # %else5
; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB78_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB78_13
-; RV64ZVE32F-NEXT: .LBB78_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB78_14
-; RV64ZVE32F-NEXT: .LBB78_7: # %else11
+; RV64ZVE32F-NEXT: .LBB78_8: # %else11
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB78_9
-; RV64ZVE32F-NEXT: .LBB78_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a2, .LBB78_10
+; RV64ZVE32F-NEXT: .LBB78_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB78_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB78_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB78_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB78_16
-; RV64ZVE32F-NEXT: .LBB78_11: # %else20
+; RV64ZVE32F-NEXT: .LBB78_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB78_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: flw fa5, 0(a2)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB78_6
; RV64ZVE32F-NEXT: .LBB78_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
@@ -9015,19 +8999,19 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB78_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB78_8
; RV64ZVE32F-NEXT: .LBB78_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: flw fa5, 0(a2)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB78_8
-; RV64ZVE32F-NEXT: j .LBB78_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB78_9
+; RV64ZVE32F-NEXT: j .LBB78_10
; RV64ZVE32F-NEXT: .LBB78_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -9038,7 +9022,7 @@ define <8 x float> @mgather_baseidx_sext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB78_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB78_12
; RV64ZVE32F-NEXT: .LBB78_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -9111,60 +9095,58 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 2, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB79_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB79_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB79_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: and a3, a3, a1
+; RV64ZVE32F-NEXT: slli a3, a3, 2
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: flw fa5, 0(a3)
+; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
+; RV64ZVE32F-NEXT: .LBB79_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB79_13
-; RV64ZVE32F-NEXT: .LBB79_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB79_14
-; RV64ZVE32F-NEXT: .LBB79_7: # %else11
+; RV64ZVE32F-NEXT: .LBB79_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB79_9
-; RV64ZVE32F-NEXT: .LBB79_8: # %cond.load13
+; RV64ZVE32F-NEXT: beqz a3, .LBB79_10
+; RV64ZVE32F-NEXT: .LBB79_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: flw fa5, 0(a3)
; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
; RV64ZVE32F-NEXT: vsetivli zero, 6, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 5
-; RV64ZVE32F-NEXT: .LBB79_9: # %else14
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 5
+; RV64ZVE32F-NEXT: .LBB79_10: # %else14
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a3, .LBB79_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else17
+; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
; RV64ZVE32F-NEXT: bnez a2, .LBB79_16
-; RV64ZVE32F-NEXT: .LBB79_11: # %else20
+; RV64ZVE32F-NEXT: .LBB79_12: # %else20
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB79_12: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: flw fa5, 0(a3)
-; RV64ZVE32F-NEXT: vsetvli zero, zero, e32, m1, ta, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 2
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB79_6
; RV64ZVE32F-NEXT: .LBB79_13: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
@@ -9174,20 +9156,20 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 3
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB79_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB79_8
; RV64ZVE32F-NEXT: .LBB79_14: # %cond.load10
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: flw fa5, 0(a3)
; RV64ZVE32F-NEXT: vsetivli zero, 5, e32, m2, tu, ma
-; RV64ZVE32F-NEXT: vfmv.s.f v8, fa5
-; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vfmv.s.f v12, fa5
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 4
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB79_8
-; RV64ZVE32F-NEXT: j .LBB79_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB79_9
+; RV64ZVE32F-NEXT: j .LBB79_10
; RV64ZVE32F-NEXT: .LBB79_15: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a3, a3, a1
@@ -9199,7 +9181,7 @@ define <8 x float> @mgather_baseidx_zext_v8i16_v8f32(ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 7, e32, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 6
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB79_11
+; RV64ZVE32F-NEXT: beqz a2, .LBB79_12
; RV64ZVE32F-NEXT: .LBB79_16: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
@@ -10042,46 +10024,53 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa1, 0(a3)
; RV64ZVE32F-NEXT: .LBB87_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB87_14
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB87_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: slli a3, a3, 3
+; RV64ZVE32F-NEXT: add a3, a1, a3
+; RV64ZVE32F-NEXT: fld fa2, 0(a3)
+; RV64ZVE32F-NEXT: .LBB87_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB87_15
-; RV64ZVE32F-NEXT: .LBB87_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB87_16
-; RV64ZVE32F-NEXT: .LBB87_7: # %else11
+; RV64ZVE32F-NEXT: .LBB87_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB87_9
-; RV64ZVE32F-NEXT: .LBB87_8: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: beqz a3, .LBB87_10
+; RV64ZVE32F-NEXT: .LBB87_9: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa5, 0(a3)
-; RV64ZVE32F-NEXT: .LBB87_9: # %else14
+; RV64ZVE32F-NEXT: .LBB87_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB87_11
-; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB87_12
+; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa6, 0(a3)
-; RV64ZVE32F-NEXT: .LBB87_11: # %else17
+; RV64ZVE32F-NEXT: .LBB87_12: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB87_13
-; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a2, .LBB87_14
+; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: fld fa7, 0(a1)
-; RV64ZVE32F-NEXT: .LBB87_13: # %else20
+; RV64ZVE32F-NEXT: .LBB87_14: # %else20
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
@@ -10091,29 +10080,24 @@ define <8 x double> @mgather_baseidx_v8i8_v8f64(ptr %base, <8 x i8> %idxs, <8 x
; RV64ZVE32F-NEXT: fsd fa6, 48(a0)
; RV64ZVE32F-NEXT: fsd fa7, 56(a0)
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB87_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: slli a3, a3, 3
-; RV64ZVE32F-NEXT: add a3, a1, a3
-; RV64ZVE32F-NEXT: fld fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB87_6
; RV64ZVE32F-NEXT: .LBB87_15: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa3, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB87_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB87_8
; RV64ZVE32F-NEXT: .LBB87_16: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa4, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB87_8
-; RV64ZVE32F-NEXT: j .LBB87_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB87_9
+; RV64ZVE32F-NEXT: j .LBB87_10
%ptrs = getelementptr inbounds double, ptr %base, <8 x i8> %idxs
%v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
ret <8 x double> %v
@@ -10257,46 +10241,53 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa1, 0(a3)
; RV64ZVE32F-NEXT: .LBB88_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB88_14
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB88_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: slli a3, a3, 3
+; RV64ZVE32F-NEXT: add a3, a1, a3
+; RV64ZVE32F-NEXT: fld fa2, 0(a3)
+; RV64ZVE32F-NEXT: .LBB88_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB88_15
-; RV64ZVE32F-NEXT: .LBB88_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB88_16
-; RV64ZVE32F-NEXT: .LBB88_7: # %else11
+; RV64ZVE32F-NEXT: .LBB88_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB88_9
-; RV64ZVE32F-NEXT: .LBB88_8: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: beqz a3, .LBB88_10
+; RV64ZVE32F-NEXT: .LBB88_9: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa5, 0(a3)
-; RV64ZVE32F-NEXT: .LBB88_9: # %else14
+; RV64ZVE32F-NEXT: .LBB88_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB88_11
-; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB88_12
+; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa6, 0(a3)
-; RV64ZVE32F-NEXT: .LBB88_11: # %else17
+; RV64ZVE32F-NEXT: .LBB88_12: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB88_13
-; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a2, .LBB88_14
+; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: fld fa7, 0(a1)
-; RV64ZVE32F-NEXT: .LBB88_13: # %else20
+; RV64ZVE32F-NEXT: .LBB88_14: # %else20
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
@@ -10306,29 +10297,24 @@ define <8 x double> @mgather_baseidx_sext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
; RV64ZVE32F-NEXT: fsd fa6, 48(a0)
; RV64ZVE32F-NEXT: fsd fa7, 56(a0)
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB88_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: slli a3, a3, 3
-; RV64ZVE32F-NEXT: add a3, a1, a3
-; RV64ZVE32F-NEXT: fld fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB88_6
; RV64ZVE32F-NEXT: .LBB88_15: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa3, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB88_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB88_8
; RV64ZVE32F-NEXT: .LBB88_16: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa4, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB88_8
-; RV64ZVE32F-NEXT: j .LBB88_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB88_9
+; RV64ZVE32F-NEXT: j .LBB88_10
%eidxs = sext <8 x i8> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
%v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
@@ -10475,49 +10461,57 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa1, 0(a3)
; RV64ZVE32F-NEXT: .LBB89_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB89_14
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB89_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: andi a3, a3, 255
+; RV64ZVE32F-NEXT: slli a3, a3, 3
+; RV64ZVE32F-NEXT: add a3, a1, a3
+; RV64ZVE32F-NEXT: fld fa2, 0(a3)
+; RV64ZVE32F-NEXT: .LBB89_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB89_15
-; RV64ZVE32F-NEXT: .LBB89_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB89_16
-; RV64ZVE32F-NEXT: .LBB89_7: # %else11
+; RV64ZVE32F-NEXT: .LBB89_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB89_9
-; RV64ZVE32F-NEXT: .LBB89_8: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: beqz a3, .LBB89_10
+; RV64ZVE32F-NEXT: .LBB89_9: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: andi a3, a3, 255
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa5, 0(a3)
-; RV64ZVE32F-NEXT: .LBB89_9: # %else14
+; RV64ZVE32F-NEXT: .LBB89_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB89_11
-; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB89_12
+; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: andi a3, a3, 255
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa6, 0(a3)
-; RV64ZVE32F-NEXT: .LBB89_11: # %else17
+; RV64ZVE32F-NEXT: .LBB89_12: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB89_13
-; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a2, .LBB89_14
+; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: fld fa7, 0(a1)
-; RV64ZVE32F-NEXT: .LBB89_13: # %else20
+; RV64ZVE32F-NEXT: .LBB89_14: # %else20
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
@@ -10527,32 +10521,26 @@ define <8 x double> @mgather_baseidx_zext_v8i8_v8f64(ptr %base, <8 x i8> %idxs,
; RV64ZVE32F-NEXT: fsd fa6, 48(a0)
; RV64ZVE32F-NEXT: fsd fa7, 56(a0)
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB89_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: andi a3, a3, 255
-; RV64ZVE32F-NEXT: slli a3, a3, 3
-; RV64ZVE32F-NEXT: add a3, a1, a3
-; RV64ZVE32F-NEXT: fld fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB89_6
; RV64ZVE32F-NEXT: .LBB89_15: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: andi a3, a3, 255
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa3, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB89_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB89_8
; RV64ZVE32F-NEXT: .LBB89_16: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: andi a3, a3, 255
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa4, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB89_8
-; RV64ZVE32F-NEXT: j .LBB89_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB89_9
+; RV64ZVE32F-NEXT: j .LBB89_10
%eidxs = zext <8 x i8> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
%v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
@@ -10698,46 +10686,53 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa1, 0(a3)
; RV64ZVE32F-NEXT: .LBB90_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB90_14
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB90_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: slli a3, a3, 3
+; RV64ZVE32F-NEXT: add a3, a1, a3
+; RV64ZVE32F-NEXT: fld fa2, 0(a3)
+; RV64ZVE32F-NEXT: .LBB90_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB90_15
-; RV64ZVE32F-NEXT: .LBB90_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB90_16
-; RV64ZVE32F-NEXT: .LBB90_7: # %else11
+; RV64ZVE32F-NEXT: .LBB90_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB90_9
-; RV64ZVE32F-NEXT: .LBB90_8: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: beqz a3, .LBB90_10
+; RV64ZVE32F-NEXT: .LBB90_9: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa5, 0(a3)
-; RV64ZVE32F-NEXT: .LBB90_9: # %else14
+; RV64ZVE32F-NEXT: .LBB90_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB90_11
-; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB90_12
+; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa6, 0(a3)
-; RV64ZVE32F-NEXT: .LBB90_11: # %else17
+; RV64ZVE32F-NEXT: .LBB90_12: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB90_13
-; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a2, .LBB90_14
+; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: fld fa7, 0(a1)
-; RV64ZVE32F-NEXT: .LBB90_13: # %else20
+; RV64ZVE32F-NEXT: .LBB90_14: # %else20
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
@@ -10747,29 +10742,24 @@ define <8 x double> @mgather_baseidx_v8i16_v8f64(ptr %base, <8 x i16> %idxs, <8
; RV64ZVE32F-NEXT: fsd fa6, 48(a0)
; RV64ZVE32F-NEXT: fsd fa7, 56(a0)
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB90_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: slli a3, a3, 3
-; RV64ZVE32F-NEXT: add a3, a1, a3
-; RV64ZVE32F-NEXT: fld fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB90_6
; RV64ZVE32F-NEXT: .LBB90_15: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa3, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB90_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB90_8
; RV64ZVE32F-NEXT: .LBB90_16: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa4, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB90_8
-; RV64ZVE32F-NEXT: j .LBB90_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB90_9
+; RV64ZVE32F-NEXT: j .LBB90_10
%ptrs = getelementptr inbounds double, ptr %base, <8 x i16> %idxs
%v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
ret <8 x double> %v
@@ -10914,46 +10904,53 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa1, 0(a3)
; RV64ZVE32F-NEXT: .LBB91_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB91_14
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB91_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: slli a3, a3, 3
+; RV64ZVE32F-NEXT: add a3, a1, a3
+; RV64ZVE32F-NEXT: fld fa2, 0(a3)
+; RV64ZVE32F-NEXT: .LBB91_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB91_15
-; RV64ZVE32F-NEXT: .LBB91_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB91_16
-; RV64ZVE32F-NEXT: .LBB91_7: # %else11
+; RV64ZVE32F-NEXT: .LBB91_8: # %else11
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB91_9
-; RV64ZVE32F-NEXT: .LBB91_8: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: beqz a3, .LBB91_10
+; RV64ZVE32F-NEXT: .LBB91_9: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa5, 0(a3)
-; RV64ZVE32F-NEXT: .LBB91_9: # %else14
+; RV64ZVE32F-NEXT: .LBB91_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a3, .LBB91_11
-; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB91_12
+; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa6, 0(a3)
-; RV64ZVE32F-NEXT: .LBB91_11: # %else17
+; RV64ZVE32F-NEXT: .LBB91_12: # %else17
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB91_13
-; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a2, .LBB91_14
+; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: fld fa7, 0(a1)
-; RV64ZVE32F-NEXT: .LBB91_13: # %else20
+; RV64ZVE32F-NEXT: .LBB91_14: # %else20
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
@@ -10963,29 +10960,24 @@ define <8 x double> @mgather_baseidx_sext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV64ZVE32F-NEXT: fsd fa6, 48(a0)
; RV64ZVE32F-NEXT: fsd fa7, 56(a0)
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB91_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: slli a3, a3, 3
-; RV64ZVE32F-NEXT: add a3, a1, a3
-; RV64ZVE32F-NEXT: fld fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB91_6
; RV64ZVE32F-NEXT: .LBB91_15: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa3, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB91_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB91_8
; RV64ZVE32F-NEXT: .LBB91_16: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a1, a3
; RV64ZVE32F-NEXT: fld fa4, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB91_8
-; RV64ZVE32F-NEXT: j .LBB91_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB91_9
+; RV64ZVE32F-NEXT: j .LBB91_10
%eidxs = sext <8 x i16> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
%v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
@@ -11135,49 +11127,57 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: fld fa1, 0(a4)
; RV64ZVE32F-NEXT: .LBB92_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a4, a3, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a4, .LBB92_14
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a4, .LBB92_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a4, v9
+; RV64ZVE32F-NEXT: and a4, a4, a2
+; RV64ZVE32F-NEXT: slli a4, a4, 3
+; RV64ZVE32F-NEXT: add a4, a1, a4
+; RV64ZVE32F-NEXT: fld fa2, 0(a4)
+; RV64ZVE32F-NEXT: .LBB92_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a4, a3, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a4, .LBB92_15
-; RV64ZVE32F-NEXT: .LBB92_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a4, a3, 16
; RV64ZVE32F-NEXT: bnez a4, .LBB92_16
-; RV64ZVE32F-NEXT: .LBB92_7: # %else11
+; RV64ZVE32F-NEXT: .LBB92_8: # %else11
; RV64ZVE32F-NEXT: andi a4, a3, 32
-; RV64ZVE32F-NEXT: beqz a4, .LBB92_9
-; RV64ZVE32F-NEXT: .LBB92_8: # %cond.load13
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a4, v8
+; RV64ZVE32F-NEXT: beqz a4, .LBB92_10
+; RV64ZVE32F-NEXT: .LBB92_9: # %cond.load13
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a4, v9
; RV64ZVE32F-NEXT: and a4, a4, a2
; RV64ZVE32F-NEXT: slli a4, a4, 3
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: fld fa5, 0(a4)
-; RV64ZVE32F-NEXT: .LBB92_9: # %else14
+; RV64ZVE32F-NEXT: .LBB92_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a4, a3, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: beqz a4, .LBB92_11
-; RV64ZVE32F-NEXT: # %bb.10: # %cond.load16
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: beqz a4, .LBB92_12
+; RV64ZVE32F-NEXT: # %bb.11: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a4, v8
; RV64ZVE32F-NEXT: and a4, a4, a2
; RV64ZVE32F-NEXT: slli a4, a4, 3
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: fld fa6, 0(a4)
-; RV64ZVE32F-NEXT: .LBB92_11: # %else17
+; RV64ZVE32F-NEXT: .LBB92_12: # %else17
; RV64ZVE32F-NEXT: andi a3, a3, -128
-; RV64ZVE32F-NEXT: beqz a3, .LBB92_13
-; RV64ZVE32F-NEXT: # %bb.12: # %cond.load19
+; RV64ZVE32F-NEXT: beqz a3, .LBB92_14
+; RV64ZVE32F-NEXT: # %bb.13: # %cond.load19
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a2, a3, a2
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a1, a1, a2
; RV64ZVE32F-NEXT: fld fa7, 0(a1)
-; RV64ZVE32F-NEXT: .LBB92_13: # %else20
+; RV64ZVE32F-NEXT: .LBB92_14: # %else20
; RV64ZVE32F-NEXT: fsd fa0, 0(a0)
; RV64ZVE32F-NEXT: fsd fa1, 8(a0)
; RV64ZVE32F-NEXT: fsd fa2, 16(a0)
@@ -11187,32 +11187,26 @@ define <8 x double> @mgather_baseidx_zext_v8i16_v8f64(ptr %base, <8 x i16> %idxs
; RV64ZVE32F-NEXT: fsd fa6, 48(a0)
; RV64ZVE32F-NEXT: fsd fa7, 56(a0)
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB92_14: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a4, v8
-; RV64ZVE32F-NEXT: and a4, a4, a2
-; RV64ZVE32F-NEXT: slli a4, a4, 3
-; RV64ZVE32F-NEXT: add a4, a1, a4
-; RV64ZVE32F-NEXT: fld fa2, 0(a4)
-; RV64ZVE32F-NEXT: andi a4, a3, 8
-; RV64ZVE32F-NEXT: beqz a4, .LBB92_6
; RV64ZVE32F-NEXT: .LBB92_15: # %cond.load7
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a4, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a4, v9
; RV64ZVE32F-NEXT: and a4, a4, a2
; RV64ZVE32F-NEXT: slli a4, a4, 3
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: fld fa3, 0(a4)
; RV64ZVE32F-NEXT: andi a4, a3, 16
-; RV64ZVE32F-NEXT: beqz a4, .LBB92_7
+; RV64ZVE32F-NEXT: beqz a4, .LBB92_8
; RV64ZVE32F-NEXT: .LBB92_16: # %cond.load10
-; RV64ZVE32F-NEXT: vmv.x.s a4, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a4, v8
; RV64ZVE32F-NEXT: and a4, a4, a2
; RV64ZVE32F-NEXT: slli a4, a4, 3
; RV64ZVE32F-NEXT: add a4, a1, a4
; RV64ZVE32F-NEXT: fld fa4, 0(a4)
; RV64ZVE32F-NEXT: andi a4, a3, 32
-; RV64ZVE32F-NEXT: bnez a4, .LBB92_8
-; RV64ZVE32F-NEXT: j .LBB92_9
+; RV64ZVE32F-NEXT: bnez a4, .LBB92_9
+; RV64ZVE32F-NEXT: j .LBB92_10
%eidxs = zext <8 x i16> %idxs to <8 x i64>
%ptrs = getelementptr inbounds double, ptr %base, <8 x i64> %eidxs
%v = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> %ptrs, i32 8, <8 x i1> %m, <8 x double> %passthru)
@@ -12145,31 +12139,29 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 1
; RV64ZVE32F-NEXT: .LBB97_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB97_25
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
+; RV64ZVE32F-NEXT: beqz a2, .LBB97_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 2
+; RV64ZVE32F-NEXT: .LBB97_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB97_26
-; RV64ZVE32F-NEXT: .LBB97_6: # %else8
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB97_8
-; RV64ZVE32F-NEXT: .LBB97_7: # %cond.load10
-; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB97_27
; RV64ZVE32F-NEXT: .LBB97_8: # %else11
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB97_10
-; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13
+; RV64ZVE32F-NEXT: .LBB97_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
@@ -12179,16 +12171,18 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 5
; RV64ZVE32F-NEXT: .LBB97_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB97_27
+; RV64ZVE32F-NEXT: bnez a2, .LBB97_28
; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a2, a1, 128
-; RV64ZVE32F-NEXT: bnez a2, .LBB97_28
+; RV64ZVE32F-NEXT: bnez a2, .LBB97_29
; RV64ZVE32F-NEXT: .LBB97_12: # %else20
; RV64ZVE32F-NEXT: andi a2, a1, 256
-; RV64ZVE32F-NEXT: bnez a2, .LBB97_29
+; RV64ZVE32F-NEXT: bnez a2, .LBB97_30
; RV64ZVE32F-NEXT: .LBB97_13: # %else23
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: beqz a2, .LBB97_15
@@ -12202,47 +12196,54 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 9
; RV64ZVE32F-NEXT: .LBB97_15: # %else26
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 1024
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB97_30
-; RV64ZVE32F-NEXT: # %bb.16: # %else29
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB97_17
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10
+; RV64ZVE32F-NEXT: .LBB97_17: # %else29
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 52
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bltz a2, .LBB97_31
-; RV64ZVE32F-NEXT: .LBB97_17: # %else32
+; RV64ZVE32F-NEXT: # %bb.18: # %else32
; RV64ZVE32F-NEXT: slli a2, a1, 51
; RV64ZVE32F-NEXT: bltz a2, .LBB97_32
-; RV64ZVE32F-NEXT: .LBB97_18: # %else35
+; RV64ZVE32F-NEXT: .LBB97_19: # %else35
; RV64ZVE32F-NEXT: slli a2, a1, 50
-; RV64ZVE32F-NEXT: bgez a2, .LBB97_20
-; RV64ZVE32F-NEXT: .LBB97_19: # %cond.load37
+; RV64ZVE32F-NEXT: bgez a2, .LBB97_21
+; RV64ZVE32F-NEXT: .LBB97_20: # %cond.load37
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 14, e8, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 13
-; RV64ZVE32F-NEXT: .LBB97_20: # %else38
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 13
+; RV64ZVE32F-NEXT: .LBB97_21: # %else38
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 49
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v10, 2
-; RV64ZVE32F-NEXT: bgez a2, .LBB97_22
-; RV64ZVE32F-NEXT: # %bb.21: # %cond.load40
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: bgez a2, .LBB97_23
+; RV64ZVE32F-NEXT: # %bb.22: # %cond.load40
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 15, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 14
-; RV64ZVE32F-NEXT: .LBB97_22: # %else41
+; RV64ZVE32F-NEXT: .LBB97_23: # %else41
; RV64ZVE32F-NEXT: lui a2, 1048568
; RV64ZVE32F-NEXT: and a1, a1, a2
-; RV64ZVE32F-NEXT: beqz a1, .LBB97_24
-; RV64ZVE32F-NEXT: # %bb.23: # %cond.load43
+; RV64ZVE32F-NEXT: beqz a1, .LBB97_25
+; RV64ZVE32F-NEXT: # %bb.24: # %cond.load43
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -12251,18 +12252,9 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vmv.s.x v8, a0
; RV64ZVE32F-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 15
-; RV64ZVE32F-NEXT: .LBB97_24: # %else44
+; RV64ZVE32F-NEXT: .LBB97_25: # %else44
; RV64ZVE32F-NEXT: vmv1r.v v8, v9
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB97_25: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v12, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB97_6
; RV64ZVE32F-NEXT: .LBB97_26: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
@@ -12273,9 +12265,18 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: bnez a2, .LBB97_7
-; RV64ZVE32F-NEXT: j .LBB97_8
-; RV64ZVE32F-NEXT: .LBB97_27: # %cond.load16
+; RV64ZVE32F-NEXT: beqz a2, .LBB97_8
+; RV64ZVE32F-NEXT: .LBB97_27: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m1, tu, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: vmv.s.x v11, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 4
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: bnez a2, .LBB97_9
+; RV64ZVE32F-NEXT: j .LBB97_10
+; RV64ZVE32F-NEXT: .LBB97_28: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
@@ -12284,7 +12285,7 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 6
; RV64ZVE32F-NEXT: andi a2, a1, 128
; RV64ZVE32F-NEXT: beqz a2, .LBB97_12
-; RV64ZVE32F-NEXT: .LBB97_28: # %cond.load19
+; RV64ZVE32F-NEXT: .LBB97_29: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
@@ -12295,7 +12296,7 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 7
; RV64ZVE32F-NEXT: andi a2, a1, 256
; RV64ZVE32F-NEXT: beqz a2, .LBB97_13
-; RV64ZVE32F-NEXT: .LBB97_29: # %cond.load22
+; RV64ZVE32F-NEXT: .LBB97_30: # %cond.load22
; RV64ZVE32F-NEXT: vsetivli zero, 9, e8, m1, tu, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -12305,36 +12306,27 @@ define <16 x i8> @mgather_baseidx_v16i8(ptr %base, <16 x i8> %idxs, <16 x i1> %m
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: bnez a2, .LBB97_14
; RV64ZVE32F-NEXT: j .LBB97_15
-; RV64ZVE32F-NEXT: .LBB97_30: # %cond.load28
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v11, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v11, 10
-; RV64ZVE32F-NEXT: slli a2, a1, 52
-; RV64ZVE32F-NEXT: bgez a2, .LBB97_17
; RV64ZVE32F-NEXT: .LBB97_31: # %cond.load31
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
; RV64ZVE32F-NEXT: vsetivli zero, 12, e8, m1, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 11
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 11
; RV64ZVE32F-NEXT: slli a2, a1, 51
-; RV64ZVE32F-NEXT: bgez a2, .LBB97_18
+; RV64ZVE32F-NEXT: bgez a2, .LBB97_19
; RV64ZVE32F-NEXT: .LBB97_32: # %cond.load34
; RV64ZVE32F-NEXT: vsetivli zero, 13, e8, m1, tu, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: vmv.s.x v8, a2
-; RV64ZVE32F-NEXT: vslideup.vi v9, v8, 12
+; RV64ZVE32F-NEXT: vmv.s.x v10, a2
+; RV64ZVE32F-NEXT: vslideup.vi v9, v10, 12
; RV64ZVE32F-NEXT: slli a2, a1, 50
-; RV64ZVE32F-NEXT: bltz a2, .LBB97_19
-; RV64ZVE32F-NEXT: j .LBB97_20
+; RV64ZVE32F-NEXT: bltz a2, .LBB97_20
+; RV64ZVE32F-NEXT: j .LBB97_21
%ptrs = getelementptr inbounds i8, ptr %base, <16 x i8> %idxs
%v = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %ptrs, i32 2, <16 x i1> %m, <16 x i8> %passthru)
ret <16 x i8> %v
@@ -12403,37 +12395,33 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 1
; RV64ZVE32F-NEXT: .LBB98_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB98_49
-; RV64ZVE32F-NEXT: # %bb.5: # %else5
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: bnez a2, .LBB98_50
-; RV64ZVE32F-NEXT: .LBB98_6: # %else8
-; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB98_8
-; RV64ZVE32F-NEXT: .LBB98_7: # %cond.load10
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v13
+; RV64ZVE32F-NEXT: beqz a2, .LBB98_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.load4
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4
+; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2
+; RV64ZVE32F-NEXT: .LBB98_6: # %else5
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v13, v8, 4
+; RV64ZVE32F-NEXT: bnez a2, .LBB98_50
+; RV64ZVE32F-NEXT: # %bb.7: # %else8
+; RV64ZVE32F-NEXT: andi a2, a1, 16
+; RV64ZVE32F-NEXT: bnez a2, .LBB98_51
; RV64ZVE32F-NEXT: .LBB98_8: # %else11
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB98_10
-; RV64ZVE32F-NEXT: # %bb.9: # %cond.load13
+; RV64ZVE32F-NEXT: .LBB98_9: # %cond.load13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v14
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
@@ -12441,16 +12429,18 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 6, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 5
; RV64ZVE32F-NEXT: .LBB98_10: # %else14
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB98_51
+; RV64ZVE32F-NEXT: bnez a2, .LBB98_52
; RV64ZVE32F-NEXT: # %bb.11: # %else17
; RV64ZVE32F-NEXT: andi a2, a1, 128
-; RV64ZVE32F-NEXT: bnez a2, .LBB98_52
+; RV64ZVE32F-NEXT: bnez a2, .LBB98_53
; RV64ZVE32F-NEXT: .LBB98_12: # %else20
; RV64ZVE32F-NEXT: andi a2, a1, 256
-; RV64ZVE32F-NEXT: bnez a2, .LBB98_53
+; RV64ZVE32F-NEXT: bnez a2, .LBB98_54
; RV64ZVE32F-NEXT: .LBB98_13: # %else23
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: beqz a2, .LBB98_15
@@ -12465,14 +12455,12 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 10, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 9
; RV64ZVE32F-NEXT: .LBB98_15: # %else26
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 1024
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2
; RV64ZVE32F-NEXT: beqz a2, .LBB98_17
; RV64ZVE32F-NEXT: # %bb.16: # %cond.load28
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
@@ -12480,12 +12468,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 11, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 10
; RV64ZVE32F-NEXT: .LBB98_17: # %else29
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 52
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4
; RV64ZVE32F-NEXT: bgez a2, .LBB98_19
; RV64ZVE32F-NEXT: # %bb.18: # %cond.load31
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
@@ -12498,7 +12488,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 16
; RV64ZVE32F-NEXT: bgez a2, .LBB98_21
; RV64ZVE32F-NEXT: # %bb.20: # %cond.load34
-; RV64ZVE32F-NEXT: vmv.x.s a2, v13
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
@@ -12510,7 +12500,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: bgez a2, .LBB98_23
; RV64ZVE32F-NEXT: # %bb.22: # %cond.load37
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 1
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
@@ -12521,14 +12511,14 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: .LBB98_23: # %else38
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 49
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v13, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_54
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v12, 2
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_55
; RV64ZVE32F-NEXT: # %bb.24: # %else41
; RV64ZVE32F-NEXT: slli a2, a1, 48
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_55
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_56
; RV64ZVE32F-NEXT: .LBB98_25: # %else44
; RV64ZVE32F-NEXT: slli a2, a1, 47
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_56
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_57
; RV64ZVE32F-NEXT: .LBB98_26: # %else47
; RV64ZVE32F-NEXT: slli a2, a1, 46
; RV64ZVE32F-NEXT: bgez a2, .LBB98_28
@@ -12543,34 +12533,30 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 18, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 17
; RV64ZVE32F-NEXT: .LBB98_28: # %else50
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 45
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_57
-; RV64ZVE32F-NEXT: # %bb.29: # %else53
-; RV64ZVE32F-NEXT: slli a2, a1, 44
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_58
-; RV64ZVE32F-NEXT: .LBB98_30: # %else56
-; RV64ZVE32F-NEXT: slli a2, a1, 43
-; RV64ZVE32F-NEXT: bgez a2, .LBB98_32
-; RV64ZVE32F-NEXT: .LBB98_31: # %cond.load58
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: bgez a2, .LBB98_30
+; RV64ZVE32F-NEXT: # %bb.29: # %cond.load52
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20
+; RV64ZVE32F-NEXT: vmv.s.x v14, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18
+; RV64ZVE32F-NEXT: .LBB98_30: # %else53
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: slli a2, a1, 44
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_58
+; RV64ZVE32F-NEXT: # %bb.31: # %else56
+; RV64ZVE32F-NEXT: slli a2, a1, 43
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_59
; RV64ZVE32F-NEXT: .LBB98_32: # %else59
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 42
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8
; RV64ZVE32F-NEXT: bgez a2, .LBB98_34
-; RV64ZVE32F-NEXT: # %bb.33: # %cond.load61
+; RV64ZVE32F-NEXT: .LBB98_33: # %cond.load61
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v9, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v12
@@ -12581,16 +12567,18 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 22, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 21
; RV64ZVE32F-NEXT: .LBB98_34: # %else62
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 41
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_59
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_60
; RV64ZVE32F-NEXT: # %bb.35: # %else65
; RV64ZVE32F-NEXT: slli a2, a1, 40
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_60
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_61
; RV64ZVE32F-NEXT: .LBB98_36: # %else68
; RV64ZVE32F-NEXT: slli a2, a1, 39
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_61
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_62
; RV64ZVE32F-NEXT: .LBB98_37: # %else71
; RV64ZVE32F-NEXT: slli a2, a1, 38
; RV64ZVE32F-NEXT: bgez a2, .LBB98_39
@@ -12605,37 +12593,45 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 26, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 25
; RV64ZVE32F-NEXT: .LBB98_39: # %else74
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 37
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_62
-; RV64ZVE32F-NEXT: # %bb.40: # %else77
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: bgez a2, .LBB98_41
+; RV64ZVE32F-NEXT: # %bb.40: # %cond.load76
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: li a3, 32
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26
+; RV64ZVE32F-NEXT: .LBB98_41: # %else77
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 36
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bltz a2, .LBB98_63
-; RV64ZVE32F-NEXT: .LBB98_41: # %else80
+; RV64ZVE32F-NEXT: # %bb.42: # %else80
; RV64ZVE32F-NEXT: slli a2, a1, 35
; RV64ZVE32F-NEXT: bltz a2, .LBB98_64
-; RV64ZVE32F-NEXT: .LBB98_42: # %else83
+; RV64ZVE32F-NEXT: .LBB98_43: # %else83
; RV64ZVE32F-NEXT: slli a2, a1, 34
-; RV64ZVE32F-NEXT: bgez a2, .LBB98_44
-; RV64ZVE32F-NEXT: .LBB98_43: # %cond.load85
+; RV64ZVE32F-NEXT: bgez a2, .LBB98_45
+; RV64ZVE32F-NEXT: .LBB98_44: # %cond.load85
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 30, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 29
-; RV64ZVE32F-NEXT: .LBB98_44: # %else86
+; RV64ZVE32F-NEXT: .LBB98_45: # %else86
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 33
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
-; RV64ZVE32F-NEXT: bgez a2, .LBB98_46
-; RV64ZVE32F-NEXT: # %bb.45: # %cond.load88
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
+; RV64ZVE32F-NEXT: bgez a2, .LBB98_47
+; RV64ZVE32F-NEXT: # %bb.46: # %cond.load88
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
@@ -12643,11 +12639,11 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vmv.s.x v12, a2
; RV64ZVE32F-NEXT: vsetivli zero, 31, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 30
-; RV64ZVE32F-NEXT: .LBB98_46: # %else89
+; RV64ZVE32F-NEXT: .LBB98_47: # %else89
; RV64ZVE32F-NEXT: lui a2, 524288
; RV64ZVE32F-NEXT: and a1, a1, a2
-; RV64ZVE32F-NEXT: beqz a1, .LBB98_48
-; RV64ZVE32F-NEXT: # %bb.47: # %cond.load91
+; RV64ZVE32F-NEXT: beqz a1, .LBB98_49
+; RV64ZVE32F-NEXT: # %bb.48: # %cond.load91
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -12657,19 +12653,9 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vmv.s.x v8, a0
; RV64ZVE32F-NEXT: vsetvli zero, a1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v8, 31
-; RV64ZVE32F-NEXT: .LBB98_48: # %else92
+; RV64ZVE32F-NEXT: .LBB98_49: # %else92
; RV64ZVE32F-NEXT: vmv2r.v v8, v10
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB98_49: # %cond.load4
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 3, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 2
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB98_6
; RV64ZVE32F-NEXT: .LBB98_50: # %cond.load7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1
@@ -12681,9 +12667,21 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 3
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: bnez a2, .LBB98_7
-; RV64ZVE32F-NEXT: j .LBB98_8
-; RV64ZVE32F-NEXT: .LBB98_51: # %cond.load16
+; RV64ZVE32F-NEXT: beqz a2, .LBB98_8
+; RV64ZVE32F-NEXT: .LBB98_51: # %cond.load10
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v13
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: li a3, 32
+; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v14, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 5, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 4
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: bnez a2, .LBB98_9
+; RV64ZVE32F-NEXT: j .LBB98_10
+; RV64ZVE32F-NEXT: .LBB98_52: # %cond.load16
; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
@@ -12693,7 +12691,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 6
; RV64ZVE32F-NEXT: andi a2, a1, 128
; RV64ZVE32F-NEXT: beqz a2, .LBB98_12
-; RV64ZVE32F-NEXT: .LBB98_52: # %cond.load19
+; RV64ZVE32F-NEXT: .LBB98_53: # %cond.load19
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v13
@@ -12705,7 +12703,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 7
; RV64ZVE32F-NEXT: andi a2, a1, 256
; RV64ZVE32F-NEXT: beqz a2, .LBB98_13
-; RV64ZVE32F-NEXT: .LBB98_53: # %cond.load22
+; RV64ZVE32F-NEXT: .LBB98_54: # %cond.load22
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -12718,7 +12716,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: bnez a2, .LBB98_14
; RV64ZVE32F-NEXT: j .LBB98_15
-; RV64ZVE32F-NEXT: .LBB98_54: # %cond.load40
+; RV64ZVE32F-NEXT: .LBB98_55: # %cond.load40
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
@@ -12728,7 +12726,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 14
; RV64ZVE32F-NEXT: slli a2, a1, 48
; RV64ZVE32F-NEXT: bgez a2, .LBB98_25
-; RV64ZVE32F-NEXT: .LBB98_55: # %cond.load43
+; RV64ZVE32F-NEXT: .LBB98_56: # %cond.load43
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
@@ -12740,7 +12738,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 15
; RV64ZVE32F-NEXT: slli a2, a1, 47
; RV64ZVE32F-NEXT: bgez a2, .LBB98_26
-; RV64ZVE32F-NEXT: .LBB98_56: # %cond.load46
+; RV64ZVE32F-NEXT: .LBB98_57: # %cond.load46
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -12753,16 +12751,6 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: slli a2, a1, 46
; RV64ZVE32F-NEXT: bltz a2, .LBB98_27
; RV64ZVE32F-NEXT: j .LBB98_28
-; RV64ZVE32F-NEXT: .LBB98_57: # %cond.load52
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v14, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 19, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v14, 18
-; RV64ZVE32F-NEXT: slli a2, a1, 44
-; RV64ZVE32F-NEXT: bgez a2, .LBB98_30
; RV64ZVE32F-NEXT: .LBB98_58: # %cond.load55
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1
@@ -12774,9 +12762,21 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 20, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 19
; RV64ZVE32F-NEXT: slli a2, a1, 43
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_31
-; RV64ZVE32F-NEXT: j .LBB98_32
-; RV64ZVE32F-NEXT: .LBB98_59: # %cond.load64
+; RV64ZVE32F-NEXT: bgez a2, .LBB98_32
+; RV64ZVE32F-NEXT: .LBB98_59: # %cond.load58
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: lbu a2, 0(a2)
+; RV64ZVE32F-NEXT: li a3, 32
+; RV64ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.s.x v12, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 21, e8, m2, tu, ma
+; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 20
+; RV64ZVE32F-NEXT: slli a2, a1, 42
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_33
+; RV64ZVE32F-NEXT: j .LBB98_34
+; RV64ZVE32F-NEXT: .LBB98_60: # %cond.load64
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
@@ -12786,7 +12786,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 22
; RV64ZVE32F-NEXT: slli a2, a1, 40
; RV64ZVE32F-NEXT: bgez a2, .LBB98_36
-; RV64ZVE32F-NEXT: .LBB98_60: # %cond.load67
+; RV64ZVE32F-NEXT: .LBB98_61: # %cond.load67
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
@@ -12798,7 +12798,7 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 23
; RV64ZVE32F-NEXT: slli a2, a1, 39
; RV64ZVE32F-NEXT: bgez a2, .LBB98_37
-; RV64ZVE32F-NEXT: .LBB98_61: # %cond.load70
+; RV64ZVE32F-NEXT: .LBB98_62: # %cond.load70
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -12811,20 +12811,10 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: slli a2, a1, 38
; RV64ZVE32F-NEXT: bltz a2, .LBB98_38
; RV64ZVE32F-NEXT: j .LBB98_39
-; RV64ZVE32F-NEXT: .LBB98_62: # %cond.load76
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: lbu a2, 0(a2)
-; RV64ZVE32F-NEXT: li a3, 32
-; RV64ZVE32F-NEXT: vmv.s.x v12, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 27, e8, m2, tu, ma
-; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 26
-; RV64ZVE32F-NEXT: slli a2, a1, 36
-; RV64ZVE32F-NEXT: bgez a2, .LBB98_41
; RV64ZVE32F-NEXT: .LBB98_63: # %cond.load79
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
@@ -12832,10 +12822,10 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 28, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 27
; RV64ZVE32F-NEXT: slli a2, a1, 35
-; RV64ZVE32F-NEXT: bgez a2, .LBB98_42
+; RV64ZVE32F-NEXT: bgez a2, .LBB98_43
; RV64ZVE32F-NEXT: .LBB98_64: # %cond.load82
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: lbu a2, 0(a2)
; RV64ZVE32F-NEXT: li a3, 32
@@ -12844,8 +12834,8 @@ define <32 x i8> @mgather_baseidx_v32i8(ptr %base, <32 x i8> %idxs, <32 x i1> %m
; RV64ZVE32F-NEXT: vsetivli zero, 29, e8, m2, tu, ma
; RV64ZVE32F-NEXT: vslideup.vi v10, v12, 28
; RV64ZVE32F-NEXT: slli a2, a1, 34
-; RV64ZVE32F-NEXT: bltz a2, .LBB98_43
-; RV64ZVE32F-NEXT: j .LBB98_44
+; RV64ZVE32F-NEXT: bltz a2, .LBB98_44
+; RV64ZVE32F-NEXT: j .LBB98_45
%ptrs = getelementptr inbounds i8, ptr %base, <32 x i8> %idxs
%v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %ptrs, i32 2, <32 x i1> %m, <32 x i8> %passthru)
ret <32 x i8> %v
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
index 86b88c181679c8..63c2b115d02304 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll
@@ -532,66 +532,64 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB9_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB9_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB9_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse8.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB9_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB9_13
-; RV64ZVE32F-NEXT: .LBB9_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB9_14
-; RV64ZVE32F-NEXT: .LBB9_7: # %else8
+; RV64ZVE32F-NEXT: .LBB9_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB9_9
-; RV64ZVE32F-NEXT: .LBB9_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB9_10
+; RV64ZVE32F-NEXT: .LBB9_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse8.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB9_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB9_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB9_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB9_16
-; RV64ZVE32F-NEXT: .LBB9_11: # %else14
+; RV64ZVE32F-NEXT: .LBB9_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB9_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse8.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB9_6
; RV64ZVE32F-NEXT: .LBB9_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse8.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB9_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB9_8
; RV64ZVE32F-NEXT: .LBB9_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse8.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB9_8
-; RV64ZVE32F-NEXT: j .LBB9_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB9_9
+; RV64ZVE32F-NEXT: j .LBB9_10
; RV64ZVE32F-NEXT: .LBB9_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -599,7 +597,7 @@ define void @mscatter_baseidx_v8i8(<8 x i8> %val, ptr %base, <8 x i8> %idxs, <8
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB9_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB9_12
; RV64ZVE32F-NEXT: .LBB9_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -1083,71 +1081,69 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB18_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB18_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB18_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB18_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB18_13
-; RV64ZVE32F-NEXT: .LBB18_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB18_14
-; RV64ZVE32F-NEXT: .LBB18_7: # %else8
+; RV64ZVE32F-NEXT: .LBB18_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB18_9
-; RV64ZVE32F-NEXT: .LBB18_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB18_10
+; RV64ZVE32F-NEXT: .LBB18_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB18_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB18_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB18_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB18_16
-; RV64ZVE32F-NEXT: .LBB18_11: # %else14
+; RV64ZVE32F-NEXT: .LBB18_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB18_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB18_6
; RV64ZVE32F-NEXT: .LBB18_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB18_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB18_8
; RV64ZVE32F-NEXT: .LBB18_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB18_8
-; RV64ZVE32F-NEXT: j .LBB18_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB18_9
+; RV64ZVE32F-NEXT: j .LBB18_10
; RV64ZVE32F-NEXT: .LBB18_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -1156,7 +1152,7 @@ define void @mscatter_baseidx_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8> %id
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB18_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB18_12
; RV64ZVE32F-NEXT: .LBB18_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -1216,71 +1212,69 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB19_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB19_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB19_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB19_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB19_13
-; RV64ZVE32F-NEXT: .LBB19_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB19_14
-; RV64ZVE32F-NEXT: .LBB19_7: # %else8
+; RV64ZVE32F-NEXT: .LBB19_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB19_9
-; RV64ZVE32F-NEXT: .LBB19_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB19_10
+; RV64ZVE32F-NEXT: .LBB19_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB19_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB19_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB19_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB19_16
-; RV64ZVE32F-NEXT: .LBB19_11: # %else14
+; RV64ZVE32F-NEXT: .LBB19_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB19_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB19_6
; RV64ZVE32F-NEXT: .LBB19_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB19_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB19_8
; RV64ZVE32F-NEXT: .LBB19_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB19_8
-; RV64ZVE32F-NEXT: j .LBB19_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB19_9
+; RV64ZVE32F-NEXT: j .LBB19_10
; RV64ZVE32F-NEXT: .LBB19_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -1289,7 +1283,7 @@ define void @mscatter_baseidx_sext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB19_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB19_12
; RV64ZVE32F-NEXT: .LBB19_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -1352,75 +1346,73 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB20_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB20_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB20_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB20_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB20_13
-; RV64ZVE32F-NEXT: .LBB20_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB20_14
-; RV64ZVE32F-NEXT: .LBB20_7: # %else8
+; RV64ZVE32F-NEXT: .LBB20_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB20_9
-; RV64ZVE32F-NEXT: .LBB20_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB20_10
+; RV64ZVE32F-NEXT: .LBB20_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB20_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB20_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB20_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB20_16
-; RV64ZVE32F-NEXT: .LBB20_11: # %else14
+; RV64ZVE32F-NEXT: .LBB20_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB20_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB20_6
; RV64ZVE32F-NEXT: .LBB20_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB20_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB20_8
; RV64ZVE32F-NEXT: .LBB20_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB20_8
-; RV64ZVE32F-NEXT: j .LBB20_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB20_9
+; RV64ZVE32F-NEXT: j .LBB20_10
; RV64ZVE32F-NEXT: .LBB20_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -1430,7 +1422,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i16(<8 x i16> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB20_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB20_12
; RV64ZVE32F-NEXT: .LBB20_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -1491,70 +1483,68 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB21_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB21_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB21_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB21_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB21_13
-; RV64ZVE32F-NEXT: .LBB21_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB21_14
-; RV64ZVE32F-NEXT: .LBB21_7: # %else8
+; RV64ZVE32F-NEXT: .LBB21_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB21_9
-; RV64ZVE32F-NEXT: .LBB21_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB21_10
+; RV64ZVE32F-NEXT: .LBB21_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB21_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB21_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB21_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB21_16
-; RV64ZVE32F-NEXT: .LBB21_11: # %else14
+; RV64ZVE32F-NEXT: .LBB21_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB21_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB21_6
; RV64ZVE32F-NEXT: .LBB21_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB21_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB21_8
; RV64ZVE32F-NEXT: .LBB21_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB21_8
-; RV64ZVE32F-NEXT: j .LBB21_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB21_9
+; RV64ZVE32F-NEXT: j .LBB21_10
; RV64ZVE32F-NEXT: .LBB21_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -1563,7 +1553,7 @@ define void @mscatter_baseidx_v8i16(<8 x i16> %val, ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB21_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB21_12
; RV64ZVE32F-NEXT: .LBB21_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -1991,71 +1981,69 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB29_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB29_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB29_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB29_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB29_13
-; RV64ZVE32F-NEXT: .LBB29_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB29_14
-; RV64ZVE32F-NEXT: .LBB29_7: # %else8
+; RV64ZVE32F-NEXT: .LBB29_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB29_9
-; RV64ZVE32F-NEXT: .LBB29_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB29_10
+; RV64ZVE32F-NEXT: .LBB29_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB29_9: # %else10
+; RV64ZVE32F-NEXT: .LBB29_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB29_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB29_16
-; RV64ZVE32F-NEXT: .LBB29_11: # %else14
+; RV64ZVE32F-NEXT: .LBB29_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB29_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB29_6
; RV64ZVE32F-NEXT: .LBB29_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB29_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB29_8
; RV64ZVE32F-NEXT: .LBB29_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB29_8
-; RV64ZVE32F-NEXT: j .LBB29_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB29_9
+; RV64ZVE32F-NEXT: j .LBB29_10
; RV64ZVE32F-NEXT: .LBB29_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -2064,7 +2052,7 @@ define void @mscatter_baseidx_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8> %id
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB29_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB29_12
; RV64ZVE32F-NEXT: .LBB29_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -2123,71 +2111,69 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB30_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB30_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB30_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB30_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB30_13
-; RV64ZVE32F-NEXT: .LBB30_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB30_14
-; RV64ZVE32F-NEXT: .LBB30_7: # %else8
+; RV64ZVE32F-NEXT: .LBB30_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB30_9
-; RV64ZVE32F-NEXT: .LBB30_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB30_10
+; RV64ZVE32F-NEXT: .LBB30_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB30_9: # %else10
+; RV64ZVE32F-NEXT: .LBB30_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB30_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB30_16
-; RV64ZVE32F-NEXT: .LBB30_11: # %else14
+; RV64ZVE32F-NEXT: .LBB30_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB30_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB30_6
; RV64ZVE32F-NEXT: .LBB30_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB30_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB30_8
; RV64ZVE32F-NEXT: .LBB30_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB30_8
-; RV64ZVE32F-NEXT: j .LBB30_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB30_9
+; RV64ZVE32F-NEXT: j .LBB30_10
; RV64ZVE32F-NEXT: .LBB30_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -2196,7 +2182,7 @@ define void @mscatter_baseidx_sext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB30_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB30_12
; RV64ZVE32F-NEXT: .LBB30_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -2258,55 +2244,53 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB31_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB31_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB31_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB31_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB31_13
-; RV64ZVE32F-NEXT: .LBB31_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB31_14
-; RV64ZVE32F-NEXT: .LBB31_7: # %else8
+; RV64ZVE32F-NEXT: .LBB31_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB31_9
-; RV64ZVE32F-NEXT: .LBB31_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB31_10
+; RV64ZVE32F-NEXT: .LBB31_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB31_9: # %else10
+; RV64ZVE32F-NEXT: .LBB31_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB31_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB31_16
-; RV64ZVE32F-NEXT: .LBB31_11: # %else14
+; RV64ZVE32F-NEXT: .LBB31_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB31_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB31_6
; RV64ZVE32F-NEXT: .LBB31_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -2314,10 +2298,10 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB31_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB31_8
; RV64ZVE32F-NEXT: .LBB31_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -2325,8 +2309,8 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB31_8
-; RV64ZVE32F-NEXT: j .LBB31_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB31_9
+; RV64ZVE32F-NEXT: j .LBB31_10
; RV64ZVE32F-NEXT: .LBB31_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -2336,7 +2320,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i32(<8 x i32> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB31_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB31_12
; RV64ZVE32F-NEXT: .LBB31_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -2398,71 +2382,69 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB32_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB32_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB32_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB32_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB32_13
-; RV64ZVE32F-NEXT: .LBB32_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB32_14
-; RV64ZVE32F-NEXT: .LBB32_7: # %else8
+; RV64ZVE32F-NEXT: .LBB32_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB32_9
-; RV64ZVE32F-NEXT: .LBB32_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB32_10
+; RV64ZVE32F-NEXT: .LBB32_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB32_9: # %else10
+; RV64ZVE32F-NEXT: .LBB32_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB32_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB32_16
-; RV64ZVE32F-NEXT: .LBB32_11: # %else14
+; RV64ZVE32F-NEXT: .LBB32_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB32_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB32_6
; RV64ZVE32F-NEXT: .LBB32_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB32_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB32_8
; RV64ZVE32F-NEXT: .LBB32_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB32_8
-; RV64ZVE32F-NEXT: j .LBB32_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB32_9
+; RV64ZVE32F-NEXT: j .LBB32_10
; RV64ZVE32F-NEXT: .LBB32_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -2471,7 +2453,7 @@ define void @mscatter_baseidx_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i16> %
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB32_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB32_12
; RV64ZVE32F-NEXT: .LBB32_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -2531,71 +2513,69 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB33_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB33_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB33_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB33_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB33_13
-; RV64ZVE32F-NEXT: .LBB33_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB33_14
-; RV64ZVE32F-NEXT: .LBB33_7: # %else8
+; RV64ZVE32F-NEXT: .LBB33_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB33_9
-; RV64ZVE32F-NEXT: .LBB33_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB33_10
+; RV64ZVE32F-NEXT: .LBB33_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB33_9: # %else10
+; RV64ZVE32F-NEXT: .LBB33_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB33_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB33_16
-; RV64ZVE32F-NEXT: .LBB33_11: # %else14
+; RV64ZVE32F-NEXT: .LBB33_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB33_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB33_6
; RV64ZVE32F-NEXT: .LBB33_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB33_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB33_8
; RV64ZVE32F-NEXT: .LBB33_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB33_8
-; RV64ZVE32F-NEXT: j .LBB33_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB33_9
+; RV64ZVE32F-NEXT: j .LBB33_10
; RV64ZVE32F-NEXT: .LBB33_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -2604,7 +2584,7 @@ define void @mscatter_baseidx_sext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB33_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB33_12
; RV64ZVE32F-NEXT: .LBB33_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -2669,55 +2649,53 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: .LBB34_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB34_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB34_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: and a3, a3, a1
+; RV64ZVE32F-NEXT: slli a3, a3, 2
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a3)
+; RV64ZVE32F-NEXT: .LBB34_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB34_13
-; RV64ZVE32F-NEXT: .LBB34_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB34_14
-; RV64ZVE32F-NEXT: .LBB34_7: # %else8
+; RV64ZVE32F-NEXT: .LBB34_8: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB34_9
-; RV64ZVE32F-NEXT: .LBB34_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a3, .LBB34_10
+; RV64ZVE32F-NEXT: .LBB34_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
-; RV64ZVE32F-NEXT: .LBB34_9: # %else10
+; RV64ZVE32F-NEXT: .LBB34_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a3, .LBB34_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a2, a2, -128
; RV64ZVE32F-NEXT: bnez a2, .LBB34_16
-; RV64ZVE32F-NEXT: .LBB34_11: # %else14
+; RV64ZVE32F-NEXT: .LBB34_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB34_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a3, v10
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB34_6
; RV64ZVE32F-NEXT: .LBB34_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
@@ -2725,10 +2703,10 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB34_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB34_8
; RV64ZVE32F-NEXT: .LBB34_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: vmv.x.s a3, v10
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
@@ -2736,8 +2714,8 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB34_8
-; RV64ZVE32F-NEXT: j .LBB34_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB34_9
+; RV64ZVE32F-NEXT: j .LBB34_10
; RV64ZVE32F-NEXT: .LBB34_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a3, v10
; RV64ZVE32F-NEXT: and a3, a3, a1
@@ -2747,7 +2725,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i32(<8 x i32> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB34_11
+; RV64ZVE32F-NEXT: beqz a2, .LBB34_12
; RV64ZVE32F-NEXT: .LBB34_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -3641,66 +3619,68 @@ define void @mscatter_baseidx_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8> %id
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t1, 0(a0)
; RV64ZVE32F-NEXT: .LBB42_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a0, .LBB42_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB42_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: slli a0, a0, 3
+; RV64ZVE32F-NEXT: add a0, a1, a0
+; RV64ZVE32F-NEXT: sd t0, 0(a0)
+; RV64ZVE32F-NEXT: .LBB42_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB42_13
-; RV64ZVE32F-NEXT: .LBB42_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a0, a4, 16
; RV64ZVE32F-NEXT: bnez a0, .LBB42_14
-; RV64ZVE32F-NEXT: .LBB42_7: # %else8
+; RV64ZVE32F-NEXT: .LBB42_8: # %else8
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: beqz a0, .LBB42_9
-; RV64ZVE32F-NEXT: .LBB42_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: beqz a0, .LBB42_10
+; RV64ZVE32F-NEXT: .LBB42_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a5, 0(a0)
-; RV64ZVE32F-NEXT: .LBB42_9: # %else10
+; RV64ZVE32F-NEXT: .LBB42_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB42_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a0, a4, -128
; RV64ZVE32F-NEXT: bnez a0, .LBB42_16
-; RV64ZVE32F-NEXT: .LBB42_11: # %else14
+; RV64ZVE32F-NEXT: .LBB42_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB42_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: slli a0, a0, 3
-; RV64ZVE32F-NEXT: add a0, a1, a0
-; RV64ZVE32F-NEXT: sd t0, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a4, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB42_6
; RV64ZVE32F-NEXT: .LBB42_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a7, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 16
-; RV64ZVE32F-NEXT: beqz a0, .LBB42_7
+; RV64ZVE32F-NEXT: beqz a0, .LBB42_8
; RV64ZVE32F-NEXT: .LBB42_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a6, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: bnez a0, .LBB42_8
-; RV64ZVE32F-NEXT: j .LBB42_9
+; RV64ZVE32F-NEXT: bnez a0, .LBB42_9
+; RV64ZVE32F-NEXT: j .LBB42_10
; RV64ZVE32F-NEXT: .LBB42_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, -128
-; RV64ZVE32F-NEXT: beqz a0, .LBB42_11
+; RV64ZVE32F-NEXT: beqz a0, .LBB42_12
; RV64ZVE32F-NEXT: .LBB42_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
@@ -3885,66 +3865,68 @@ define void @mscatter_baseidx_sext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t1, 0(a0)
; RV64ZVE32F-NEXT: .LBB43_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a0, .LBB43_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB43_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: slli a0, a0, 3
+; RV64ZVE32F-NEXT: add a0, a1, a0
+; RV64ZVE32F-NEXT: sd t0, 0(a0)
+; RV64ZVE32F-NEXT: .LBB43_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB43_13
-; RV64ZVE32F-NEXT: .LBB43_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a0, a4, 16
; RV64ZVE32F-NEXT: bnez a0, .LBB43_14
-; RV64ZVE32F-NEXT: .LBB43_7: # %else8
+; RV64ZVE32F-NEXT: .LBB43_8: # %else8
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: beqz a0, .LBB43_9
-; RV64ZVE32F-NEXT: .LBB43_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: beqz a0, .LBB43_10
+; RV64ZVE32F-NEXT: .LBB43_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a5, 0(a0)
-; RV64ZVE32F-NEXT: .LBB43_9: # %else10
+; RV64ZVE32F-NEXT: .LBB43_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB43_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a0, a4, -128
; RV64ZVE32F-NEXT: bnez a0, .LBB43_16
-; RV64ZVE32F-NEXT: .LBB43_11: # %else14
+; RV64ZVE32F-NEXT: .LBB43_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB43_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: slli a0, a0, 3
-; RV64ZVE32F-NEXT: add a0, a1, a0
-; RV64ZVE32F-NEXT: sd t0, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a4, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB43_6
; RV64ZVE32F-NEXT: .LBB43_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a7, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 16
-; RV64ZVE32F-NEXT: beqz a0, .LBB43_7
+; RV64ZVE32F-NEXT: beqz a0, .LBB43_8
; RV64ZVE32F-NEXT: .LBB43_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a6, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: bnez a0, .LBB43_8
-; RV64ZVE32F-NEXT: j .LBB43_9
+; RV64ZVE32F-NEXT: bnez a0, .LBB43_9
+; RV64ZVE32F-NEXT: j .LBB43_10
; RV64ZVE32F-NEXT: .LBB43_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, -128
-; RV64ZVE32F-NEXT: beqz a0, .LBB43_11
+; RV64ZVE32F-NEXT: beqz a0, .LBB43_12
; RV64ZVE32F-NEXT: .LBB43_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
@@ -4132,63 +4114,65 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t1, 0(a0)
; RV64ZVE32F-NEXT: .LBB44_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a0, .LBB44_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB44_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: andi a0, a0, 255
+; RV64ZVE32F-NEXT: slli a0, a0, 3
+; RV64ZVE32F-NEXT: add a0, a1, a0
+; RV64ZVE32F-NEXT: sd t0, 0(a0)
+; RV64ZVE32F-NEXT: .LBB44_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB44_13
-; RV64ZVE32F-NEXT: .LBB44_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a0, a4, 16
; RV64ZVE32F-NEXT: bnez a0, .LBB44_14
-; RV64ZVE32F-NEXT: .LBB44_7: # %else8
+; RV64ZVE32F-NEXT: .LBB44_8: # %else8
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: beqz a0, .LBB44_9
-; RV64ZVE32F-NEXT: .LBB44_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: beqz a0, .LBB44_10
+; RV64ZVE32F-NEXT: .LBB44_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: andi a0, a0, 255
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a5, 0(a0)
-; RV64ZVE32F-NEXT: .LBB44_9: # %else10
+; RV64ZVE32F-NEXT: .LBB44_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB44_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a0, a4, -128
; RV64ZVE32F-NEXT: bnez a0, .LBB44_16
-; RV64ZVE32F-NEXT: .LBB44_11: # %else14
+; RV64ZVE32F-NEXT: .LBB44_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB44_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: andi a0, a0, 255
-; RV64ZVE32F-NEXT: slli a0, a0, 3
-; RV64ZVE32F-NEXT: add a0, a1, a0
-; RV64ZVE32F-NEXT: sd t0, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a4, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB44_6
-; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: .LBB44_13: # %cond.store5
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: andi a0, a0, 255
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a7, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 16
-; RV64ZVE32F-NEXT: beqz a0, .LBB44_7
+; RV64ZVE32F-NEXT: beqz a0, .LBB44_8
; RV64ZVE32F-NEXT: .LBB44_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: andi a0, a0, 255
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a6, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: bnez a0, .LBB44_8
-; RV64ZVE32F-NEXT: j .LBB44_9
+; RV64ZVE32F-NEXT: bnez a0, .LBB44_9
+; RV64ZVE32F-NEXT: j .LBB44_10
; RV64ZVE32F-NEXT: .LBB44_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: andi a0, a0, 255
@@ -4196,7 +4180,7 @@ define void @mscatter_baseidx_zext_v8i8_v8i64(<8 x i64> %val, ptr %base, <8 x i8
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, -128
-; RV64ZVE32F-NEXT: beqz a0, .LBB44_11
+; RV64ZVE32F-NEXT: beqz a0, .LBB44_12
; RV64ZVE32F-NEXT: .LBB44_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
@@ -4384,66 +4368,68 @@ define void @mscatter_baseidx_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i16> %
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t1, 0(a0)
; RV64ZVE32F-NEXT: .LBB45_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a0, .LBB45_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB45_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: slli a0, a0, 3
+; RV64ZVE32F-NEXT: add a0, a1, a0
+; RV64ZVE32F-NEXT: sd t0, 0(a0)
+; RV64ZVE32F-NEXT: .LBB45_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB45_13
-; RV64ZVE32F-NEXT: .LBB45_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a0, a4, 16
; RV64ZVE32F-NEXT: bnez a0, .LBB45_14
-; RV64ZVE32F-NEXT: .LBB45_7: # %else8
+; RV64ZVE32F-NEXT: .LBB45_8: # %else8
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: beqz a0, .LBB45_9
-; RV64ZVE32F-NEXT: .LBB45_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: beqz a0, .LBB45_10
+; RV64ZVE32F-NEXT: .LBB45_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a5, 0(a0)
-; RV64ZVE32F-NEXT: .LBB45_9: # %else10
+; RV64ZVE32F-NEXT: .LBB45_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB45_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a0, a4, -128
; RV64ZVE32F-NEXT: bnez a0, .LBB45_16
-; RV64ZVE32F-NEXT: .LBB45_11: # %else14
+; RV64ZVE32F-NEXT: .LBB45_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB45_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: slli a0, a0, 3
-; RV64ZVE32F-NEXT: add a0, a1, a0
-; RV64ZVE32F-NEXT: sd t0, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a4, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB45_6
; RV64ZVE32F-NEXT: .LBB45_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a7, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 16
-; RV64ZVE32F-NEXT: beqz a0, .LBB45_7
+; RV64ZVE32F-NEXT: beqz a0, .LBB45_8
; RV64ZVE32F-NEXT: .LBB45_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a6, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: bnez a0, .LBB45_8
-; RV64ZVE32F-NEXT: j .LBB45_9
+; RV64ZVE32F-NEXT: bnez a0, .LBB45_9
+; RV64ZVE32F-NEXT: j .LBB45_10
; RV64ZVE32F-NEXT: .LBB45_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, -128
-; RV64ZVE32F-NEXT: beqz a0, .LBB45_11
+; RV64ZVE32F-NEXT: beqz a0, .LBB45_12
; RV64ZVE32F-NEXT: .LBB45_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
@@ -4629,66 +4615,68 @@ define void @mscatter_baseidx_sext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t1, 0(a0)
; RV64ZVE32F-NEXT: .LBB46_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a0, .LBB46_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB46_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: slli a0, a0, 3
+; RV64ZVE32F-NEXT: add a0, a1, a0
+; RV64ZVE32F-NEXT: sd t0, 0(a0)
+; RV64ZVE32F-NEXT: .LBB46_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB46_13
-; RV64ZVE32F-NEXT: .LBB46_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a0, a4, 16
; RV64ZVE32F-NEXT: bnez a0, .LBB46_14
-; RV64ZVE32F-NEXT: .LBB46_7: # %else8
+; RV64ZVE32F-NEXT: .LBB46_8: # %else8
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: beqz a0, .LBB46_9
-; RV64ZVE32F-NEXT: .LBB46_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: beqz a0, .LBB46_10
+; RV64ZVE32F-NEXT: .LBB46_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a5, 0(a0)
-; RV64ZVE32F-NEXT: .LBB46_9: # %else10
+; RV64ZVE32F-NEXT: .LBB46_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a4, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB46_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a0, a4, -128
; RV64ZVE32F-NEXT: bnez a0, .LBB46_16
-; RV64ZVE32F-NEXT: .LBB46_11: # %else14
+; RV64ZVE32F-NEXT: .LBB46_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB46_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: slli a0, a0, 3
-; RV64ZVE32F-NEXT: add a0, a1, a0
-; RV64ZVE32F-NEXT: sd t0, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a4, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB46_6
; RV64ZVE32F-NEXT: .LBB46_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a7, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 16
-; RV64ZVE32F-NEXT: beqz a0, .LBB46_7
+; RV64ZVE32F-NEXT: beqz a0, .LBB46_8
; RV64ZVE32F-NEXT: .LBB46_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a6, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, 32
-; RV64ZVE32F-NEXT: bnez a0, .LBB46_8
-; RV64ZVE32F-NEXT: j .LBB46_9
+; RV64ZVE32F-NEXT: bnez a0, .LBB46_9
+; RV64ZVE32F-NEXT: j .LBB46_10
; RV64ZVE32F-NEXT: .LBB46_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a4, -128
-; RV64ZVE32F-NEXT: beqz a0, .LBB46_11
+; RV64ZVE32F-NEXT: beqz a0, .LBB46_12
; RV64ZVE32F-NEXT: .LBB46_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
@@ -4879,63 +4867,65 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t2, 0(a0)
; RV64ZVE32F-NEXT: .LBB47_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a5, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a0, .LBB47_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a0, .LBB47_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: and a0, a0, a4
+; RV64ZVE32F-NEXT: slli a0, a0, 3
+; RV64ZVE32F-NEXT: add a0, a1, a0
+; RV64ZVE32F-NEXT: sd t1, 0(a0)
+; RV64ZVE32F-NEXT: .LBB47_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a0, a5, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a0, .LBB47_13
-; RV64ZVE32F-NEXT: .LBB47_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a0, a5, 16
; RV64ZVE32F-NEXT: bnez a0, .LBB47_14
-; RV64ZVE32F-NEXT: .LBB47_7: # %else8
+; RV64ZVE32F-NEXT: .LBB47_8: # %else8
; RV64ZVE32F-NEXT: andi a0, a5, 32
-; RV64ZVE32F-NEXT: beqz a0, .LBB47_9
-; RV64ZVE32F-NEXT: .LBB47_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: beqz a0, .LBB47_10
+; RV64ZVE32F-NEXT: .LBB47_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: and a0, a0, a4
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a6, 0(a0)
-; RV64ZVE32F-NEXT: .LBB47_9: # %else10
+; RV64ZVE32F-NEXT: .LBB47_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a0, a5, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a0, .LBB47_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a0, a5, -128
; RV64ZVE32F-NEXT: bnez a0, .LBB47_16
-; RV64ZVE32F-NEXT: .LBB47_11: # %else14
+; RV64ZVE32F-NEXT: .LBB47_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB47_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
-; RV64ZVE32F-NEXT: and a0, a0, a4
-; RV64ZVE32F-NEXT: slli a0, a0, 3
-; RV64ZVE32F-NEXT: add a0, a1, a0
-; RV64ZVE32F-NEXT: sd t1, 0(a0)
-; RV64ZVE32F-NEXT: andi a0, a5, 8
-; RV64ZVE32F-NEXT: beqz a0, .LBB47_6
; RV64ZVE32F-NEXT: .LBB47_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a0, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a0, v9
; RV64ZVE32F-NEXT: and a0, a0, a4
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd t0, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a5, 16
-; RV64ZVE32F-NEXT: beqz a0, .LBB47_7
+; RV64ZVE32F-NEXT: beqz a0, .LBB47_8
; RV64ZVE32F-NEXT: .LBB47_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a0, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: and a0, a0, a4
; RV64ZVE32F-NEXT: slli a0, a0, 3
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a7, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a5, 32
-; RV64ZVE32F-NEXT: bnez a0, .LBB47_8
-; RV64ZVE32F-NEXT: j .LBB47_9
+; RV64ZVE32F-NEXT: bnez a0, .LBB47_9
+; RV64ZVE32F-NEXT: j .LBB47_10
; RV64ZVE32F-NEXT: .LBB47_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
; RV64ZVE32F-NEXT: and a0, a0, a4
@@ -4943,7 +4933,7 @@ define void @mscatter_baseidx_zext_v8i16_v8i64(<8 x i64> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: add a0, a1, a0
; RV64ZVE32F-NEXT: sd a3, 0(a0)
; RV64ZVE32F-NEXT: andi a0, a5, -128
-; RV64ZVE32F-NEXT: beqz a0, .LBB47_11
+; RV64ZVE32F-NEXT: beqz a0, .LBB47_12
; RV64ZVE32F-NEXT: .LBB47_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a0, v8
@@ -6342,71 +6332,69 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB58_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB58_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB58_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB58_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB58_13
-; RV64ZVE32F-NEXT: .LBB58_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB58_14
-; RV64ZVE32F-NEXT: .LBB58_7: # %else8
+; RV64ZVE32F-NEXT: .LBB58_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB58_9
-; RV64ZVE32F-NEXT: .LBB58_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB58_10
+; RV64ZVE32F-NEXT: .LBB58_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB58_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB58_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB58_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB58_16
-; RV64ZVE32F-NEXT: .LBB58_11: # %else14
+; RV64ZVE32F-NEXT: .LBB58_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB58_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB58_6
; RV64ZVE32F-NEXT: .LBB58_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB58_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB58_8
; RV64ZVE32F-NEXT: .LBB58_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB58_8
-; RV64ZVE32F-NEXT: j .LBB58_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB58_9
+; RV64ZVE32F-NEXT: j .LBB58_10
; RV64ZVE32F-NEXT: .LBB58_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -6415,7 +6403,7 @@ define void @mscatter_baseidx_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i8> %i
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB58_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB58_12
; RV64ZVE32F-NEXT: .LBB58_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -6475,71 +6463,69 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB59_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB59_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB59_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB59_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB59_13
-; RV64ZVE32F-NEXT: .LBB59_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB59_14
-; RV64ZVE32F-NEXT: .LBB59_7: # %else8
+; RV64ZVE32F-NEXT: .LBB59_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB59_9
-; RV64ZVE32F-NEXT: .LBB59_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB59_10
+; RV64ZVE32F-NEXT: .LBB59_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB59_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB59_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB59_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB59_16
-; RV64ZVE32F-NEXT: .LBB59_11: # %else14
+; RV64ZVE32F-NEXT: .LBB59_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB59_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB59_6
; RV64ZVE32F-NEXT: .LBB59_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB59_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB59_8
; RV64ZVE32F-NEXT: .LBB59_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB59_8
-; RV64ZVE32F-NEXT: j .LBB59_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB59_9
+; RV64ZVE32F-NEXT: j .LBB59_10
; RV64ZVE32F-NEXT: .LBB59_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -6548,7 +6534,7 @@ define void @mscatter_baseidx_sext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB59_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB59_12
; RV64ZVE32F-NEXT: .LBB59_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -6611,75 +6597,73 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB60_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB60_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB60_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB60_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB60_13
-; RV64ZVE32F-NEXT: .LBB60_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB60_14
-; RV64ZVE32F-NEXT: .LBB60_7: # %else8
+; RV64ZVE32F-NEXT: .LBB60_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB60_9
-; RV64ZVE32F-NEXT: .LBB60_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB60_10
+; RV64ZVE32F-NEXT: .LBB60_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB60_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB60_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB60_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB60_16
-; RV64ZVE32F-NEXT: .LBB60_11: # %else14
+; RV64ZVE32F-NEXT: .LBB60_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB60_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB60_6
; RV64ZVE32F-NEXT: .LBB60_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB60_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB60_8
; RV64ZVE32F-NEXT: .LBB60_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB60_8
-; RV64ZVE32F-NEXT: j .LBB60_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB60_9
+; RV64ZVE32F-NEXT: j .LBB60_10
; RV64ZVE32F-NEXT: .LBB60_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -6689,7 +6673,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f16(<8 x half> %val, ptr %base, <8 x i
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB60_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB60_12
; RV64ZVE32F-NEXT: .LBB60_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -6750,70 +6734,68 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB61_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB61_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB61_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: slli a2, a2, 1
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
+; RV64ZVE32F-NEXT: vse16.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB61_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB61_13
-; RV64ZVE32F-NEXT: .LBB61_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB61_14
-; RV64ZVE32F-NEXT: .LBB61_7: # %else8
+; RV64ZVE32F-NEXT: .LBB61_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB61_9
-; RV64ZVE32F-NEXT: .LBB61_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB61_10
+; RV64ZVE32F-NEXT: .LBB61_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 5
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB61_9: # %else10
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 5
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB61_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB61_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB61_16
-; RV64ZVE32F-NEXT: .LBB61_11: # %else14
+; RV64ZVE32F-NEXT: .LBB61_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB61_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: slli a2, a2, 1
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 2
-; RV64ZVE32F-NEXT: vse16.v v11, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB61_6
; RV64ZVE32F-NEXT: .LBB61_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 3
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 3
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB61_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB61_8
; RV64ZVE32F-NEXT: .LBB61_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
-; RV64ZVE32F-NEXT: vse16.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 4
+; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB61_8
-; RV64ZVE32F-NEXT: j .LBB61_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB61_9
+; RV64ZVE32F-NEXT: j .LBB61_10
; RV64ZVE32F-NEXT: .LBB61_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 1
@@ -6822,7 +6804,7 @@ define void @mscatter_baseidx_v8f16(<8 x half> %val, ptr %base, <8 x i16> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 6
; RV64ZVE32F-NEXT: vse16.v v10, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB61_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB61_12
; RV64ZVE32F-NEXT: .LBB61_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
@@ -7196,71 +7178,69 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB68_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB68_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB68_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB68_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB68_13
-; RV64ZVE32F-NEXT: .LBB68_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB68_14
-; RV64ZVE32F-NEXT: .LBB68_7: # %else8
+; RV64ZVE32F-NEXT: .LBB68_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB68_9
-; RV64ZVE32F-NEXT: .LBB68_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB68_10
+; RV64ZVE32F-NEXT: .LBB68_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB68_9: # %else10
+; RV64ZVE32F-NEXT: .LBB68_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB68_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB68_16
-; RV64ZVE32F-NEXT: .LBB68_11: # %else14
+; RV64ZVE32F-NEXT: .LBB68_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB68_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB68_6
; RV64ZVE32F-NEXT: .LBB68_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB68_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB68_8
; RV64ZVE32F-NEXT: .LBB68_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB68_8
-; RV64ZVE32F-NEXT: j .LBB68_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB68_9
+; RV64ZVE32F-NEXT: j .LBB68_10
; RV64ZVE32F-NEXT: .LBB68_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -7269,7 +7249,7 @@ define void @mscatter_baseidx_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x i8> %
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB68_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB68_12
; RV64ZVE32F-NEXT: .LBB68_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -7328,71 +7308,69 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB69_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB69_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB69_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB69_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB69_13
-; RV64ZVE32F-NEXT: .LBB69_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB69_14
-; RV64ZVE32F-NEXT: .LBB69_7: # %else8
+; RV64ZVE32F-NEXT: .LBB69_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB69_9
-; RV64ZVE32F-NEXT: .LBB69_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB69_10
+; RV64ZVE32F-NEXT: .LBB69_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB69_9: # %else10
+; RV64ZVE32F-NEXT: .LBB69_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB69_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB69_16
-; RV64ZVE32F-NEXT: .LBB69_11: # %else14
+; RV64ZVE32F-NEXT: .LBB69_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB69_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB69_6
; RV64ZVE32F-NEXT: .LBB69_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB69_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB69_8
; RV64ZVE32F-NEXT: .LBB69_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB69_8
-; RV64ZVE32F-NEXT: j .LBB69_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB69_9
+; RV64ZVE32F-NEXT: j .LBB69_10
; RV64ZVE32F-NEXT: .LBB69_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -7401,7 +7379,7 @@ define void @mscatter_baseidx_sext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB69_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB69_12
; RV64ZVE32F-NEXT: .LBB69_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -7463,55 +7441,53 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB70_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB70_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB70_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB70_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB70_13
-; RV64ZVE32F-NEXT: .LBB70_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB70_14
-; RV64ZVE32F-NEXT: .LBB70_7: # %else8
+; RV64ZVE32F-NEXT: .LBB70_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB70_9
-; RV64ZVE32F-NEXT: .LBB70_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB70_10
+; RV64ZVE32F-NEXT: .LBB70_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB70_9: # %else10
+; RV64ZVE32F-NEXT: .LBB70_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB70_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
-; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: bnez a1, .LBB70_16
-; RV64ZVE32F-NEXT: .LBB70_11: # %else14
-; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB70_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB70_6
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
+; RV64ZVE32F-NEXT: andi a1, a1, -128
+; RV64ZVE32F-NEXT: bnez a1, .LBB70_16
+; RV64ZVE32F-NEXT: .LBB70_12: # %else14
+; RV64ZVE32F-NEXT: ret
; RV64ZVE32F-NEXT: .LBB70_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -7519,10 +7495,10 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB70_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB70_8
; RV64ZVE32F-NEXT: .LBB70_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -7530,8 +7506,8 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB70_8
-; RV64ZVE32F-NEXT: j .LBB70_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB70_9
+; RV64ZVE32F-NEXT: j .LBB70_10
; RV64ZVE32F-NEXT: .LBB70_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -7541,7 +7517,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB70_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB70_12
; RV64ZVE32F-NEXT: .LBB70_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -7603,71 +7579,69 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16>
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB71_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB71_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB71_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB71_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB71_13
-; RV64ZVE32F-NEXT: .LBB71_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB71_14
-; RV64ZVE32F-NEXT: .LBB71_7: # %else8
+; RV64ZVE32F-NEXT: .LBB71_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB71_9
-; RV64ZVE32F-NEXT: .LBB71_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB71_10
+; RV64ZVE32F-NEXT: .LBB71_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB71_9: # %else10
+; RV64ZVE32F-NEXT: .LBB71_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB71_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB71_16
-; RV64ZVE32F-NEXT: .LBB71_11: # %else14
+; RV64ZVE32F-NEXT: .LBB71_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB71_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB71_6
; RV64ZVE32F-NEXT: .LBB71_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB71_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB71_8
; RV64ZVE32F-NEXT: .LBB71_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB71_8
-; RV64ZVE32F-NEXT: j .LBB71_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB71_9
+; RV64ZVE32F-NEXT: j .LBB71_10
; RV64ZVE32F-NEXT: .LBB71_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -7676,7 +7650,7 @@ define void @mscatter_baseidx_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x i16>
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB71_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB71_12
; RV64ZVE32F-NEXT: .LBB71_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -7736,71 +7710,69 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB72_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB72_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB72_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: slli a2, a2, 2
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB72_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB72_13
-; RV64ZVE32F-NEXT: .LBB72_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB72_14
-; RV64ZVE32F-NEXT: .LBB72_7: # %else8
+; RV64ZVE32F-NEXT: .LBB72_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB72_9
-; RV64ZVE32F-NEXT: .LBB72_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a2, .LBB72_10
+; RV64ZVE32F-NEXT: .LBB72_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB72_9: # %else10
+; RV64ZVE32F-NEXT: .LBB72_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB72_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB72_16
-; RV64ZVE32F-NEXT: .LBB72_11: # %else14
+; RV64ZVE32F-NEXT: .LBB72_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB72_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: slli a2, a2, 2
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB72_6
; RV64ZVE32F-NEXT: .LBB72_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB72_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB72_8
; RV64ZVE32F-NEXT: .LBB72_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB72_8
-; RV64ZVE32F-NEXT: j .LBB72_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB72_9
+; RV64ZVE32F-NEXT: j .LBB72_10
; RV64ZVE32F-NEXT: .LBB72_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: slli a2, a2, 2
@@ -7809,7 +7781,7 @@ define void @mscatter_baseidx_sext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB72_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB72_12
; RV64ZVE32F-NEXT: .LBB72_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -7874,55 +7846,53 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: .LBB73_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB73_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB73_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: and a3, a3, a1
+; RV64ZVE32F-NEXT: slli a3, a3, 2
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
+; RV64ZVE32F-NEXT: vse32.v v12, (a3)
+; RV64ZVE32F-NEXT: .LBB73_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB73_13
-; RV64ZVE32F-NEXT: .LBB73_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB73_14
-; RV64ZVE32F-NEXT: .LBB73_7: # %else8
+; RV64ZVE32F-NEXT: .LBB73_8: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB73_9
-; RV64ZVE32F-NEXT: .LBB73_8: # %cond.store9
+; RV64ZVE32F-NEXT: beqz a3, .LBB73_10
+; RV64ZVE32F-NEXT: .LBB73_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 5
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
-; RV64ZVE32F-NEXT: .LBB73_9: # %else10
+; RV64ZVE32F-NEXT: .LBB73_10: # %else10
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
; RV64ZVE32F-NEXT: bnez a3, .LBB73_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a2, a2, -128
; RV64ZVE32F-NEXT: bnez a2, .LBB73_16
-; RV64ZVE32F-NEXT: .LBB73_11: # %else14
+; RV64ZVE32F-NEXT: .LBB73_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB73_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a3, v10
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 2
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse32.v v12, (a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB73_6
; RV64ZVE32F-NEXT: .LBB73_13: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v11
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
@@ -7930,10 +7900,10 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 3
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB73_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB73_8
; RV64ZVE32F-NEXT: .LBB73_14: # %cond.store7
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a3, v11
+; RV64ZVE32F-NEXT: vmv.x.s a3, v10
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 2
; RV64ZVE32F-NEXT: add a3, a0, a3
@@ -7941,8 +7911,8 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 4
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB73_8
-; RV64ZVE32F-NEXT: j .LBB73_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB73_9
+; RV64ZVE32F-NEXT: j .LBB73_10
; RV64ZVE32F-NEXT: .LBB73_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a3, v10
; RV64ZVE32F-NEXT: and a3, a3, a1
@@ -7952,7 +7922,7 @@ define void @mscatter_baseidx_zext_v8i16_v8f32(<8 x float> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 6
; RV64ZVE32F-NEXT: vse32.v v12, (a3)
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB73_11
+; RV64ZVE32F-NEXT: beqz a2, .LBB73_12
; RV64ZVE32F-NEXT: .LBB73_16: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
@@ -8707,66 +8677,68 @@ define void @mscatter_baseidx_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x i8>
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa1, 0(a2)
; RV64ZVE32F-NEXT: .LBB81_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB81_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB81_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 3
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
+; RV64ZVE32F-NEXT: .LBB81_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB81_13
-; RV64ZVE32F-NEXT: .LBB81_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB81_14
-; RV64ZVE32F-NEXT: .LBB81_7: # %else8
+; RV64ZVE32F-NEXT: .LBB81_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB81_9
-; RV64ZVE32F-NEXT: .LBB81_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: beqz a2, .LBB81_10
+; RV64ZVE32F-NEXT: .LBB81_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa5, 0(a2)
-; RV64ZVE32F-NEXT: .LBB81_9: # %else10
+; RV64ZVE32F-NEXT: .LBB81_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB81_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB81_16
-; RV64ZVE32F-NEXT: .LBB81_11: # %else14
+; RV64ZVE32F-NEXT: .LBB81_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB81_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 3
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB81_6
; RV64ZVE32F-NEXT: .LBB81_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa3, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB81_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB81_8
; RV64ZVE32F-NEXT: .LBB81_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa4, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB81_8
-; RV64ZVE32F-NEXT: j .LBB81_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB81_9
+; RV64ZVE32F-NEXT: j .LBB81_10
; RV64ZVE32F-NEXT: .LBB81_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa6, 0(a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB81_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB81_12
; RV64ZVE32F-NEXT: .LBB81_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -8907,66 +8879,68 @@ define void @mscatter_baseidx_sext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa1, 0(a2)
; RV64ZVE32F-NEXT: .LBB82_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB82_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB82_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 3
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
+; RV64ZVE32F-NEXT: .LBB82_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB82_13
-; RV64ZVE32F-NEXT: .LBB82_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB82_14
-; RV64ZVE32F-NEXT: .LBB82_7: # %else8
+; RV64ZVE32F-NEXT: .LBB82_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB82_9
-; RV64ZVE32F-NEXT: .LBB82_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: beqz a2, .LBB82_10
+; RV64ZVE32F-NEXT: .LBB82_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa5, 0(a2)
-; RV64ZVE32F-NEXT: .LBB82_9: # %else10
+; RV64ZVE32F-NEXT: .LBB82_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB82_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB82_16
-; RV64ZVE32F-NEXT: .LBB82_11: # %else14
+; RV64ZVE32F-NEXT: .LBB82_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB82_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 3
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB82_6
; RV64ZVE32F-NEXT: .LBB82_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa3, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB82_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB82_8
; RV64ZVE32F-NEXT: .LBB82_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa4, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB82_8
-; RV64ZVE32F-NEXT: j .LBB82_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB82_9
+; RV64ZVE32F-NEXT: j .LBB82_10
; RV64ZVE32F-NEXT: .LBB82_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa6, 0(a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB82_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB82_12
; RV64ZVE32F-NEXT: .LBB82_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -9110,63 +9084,65 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa1, 0(a2)
; RV64ZVE32F-NEXT: .LBB83_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB83_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB83_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: andi a2, a2, 255
+; RV64ZVE32F-NEXT: slli a2, a2, 3
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
+; RV64ZVE32F-NEXT: .LBB83_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB83_13
-; RV64ZVE32F-NEXT: .LBB83_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB83_14
-; RV64ZVE32F-NEXT: .LBB83_7: # %else8
+; RV64ZVE32F-NEXT: .LBB83_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB83_9
-; RV64ZVE32F-NEXT: .LBB83_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: beqz a2, .LBB83_10
+; RV64ZVE32F-NEXT: .LBB83_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa5, 0(a2)
-; RV64ZVE32F-NEXT: .LBB83_9: # %else10
+; RV64ZVE32F-NEXT: .LBB83_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB83_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB83_16
-; RV64ZVE32F-NEXT: .LBB83_11: # %else14
+; RV64ZVE32F-NEXT: .LBB83_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB83_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: andi a2, a2, 255
-; RV64ZVE32F-NEXT: slli a2, a2, 3
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB83_6
; RV64ZVE32F-NEXT: .LBB83_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa3, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB83_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB83_8
; RV64ZVE32F-NEXT: .LBB83_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa4, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB83_8
-; RV64ZVE32F-NEXT: j .LBB83_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB83_9
+; RV64ZVE32F-NEXT: j .LBB83_10
; RV64ZVE32F-NEXT: .LBB83_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: andi a2, a2, 255
@@ -9174,7 +9150,7 @@ define void @mscatter_baseidx_zext_v8i8_v8f64(<8 x double> %val, ptr %base, <8 x
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa6, 0(a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB83_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB83_12
; RV64ZVE32F-NEXT: .LBB83_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -9318,66 +9294,68 @@ define void @mscatter_baseidx_v8i16_v8f64(<8 x double> %val, ptr %base, <8 x i16
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa1, 0(a2)
; RV64ZVE32F-NEXT: .LBB84_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB84_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB84_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 3
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
+; RV64ZVE32F-NEXT: .LBB84_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB84_13
-; RV64ZVE32F-NEXT: .LBB84_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB84_14
-; RV64ZVE32F-NEXT: .LBB84_7: # %else8
+; RV64ZVE32F-NEXT: .LBB84_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB84_9
-; RV64ZVE32F-NEXT: .LBB84_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: beqz a2, .LBB84_10
+; RV64ZVE32F-NEXT: .LBB84_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa5, 0(a2)
-; RV64ZVE32F-NEXT: .LBB84_9: # %else10
+; RV64ZVE32F-NEXT: .LBB84_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB84_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB84_16
-; RV64ZVE32F-NEXT: .LBB84_11: # %else14
+; RV64ZVE32F-NEXT: .LBB84_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB84_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 3
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB84_6
; RV64ZVE32F-NEXT: .LBB84_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa3, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB84_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB84_8
; RV64ZVE32F-NEXT: .LBB84_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa4, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB84_8
-; RV64ZVE32F-NEXT: j .LBB84_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB84_9
+; RV64ZVE32F-NEXT: j .LBB84_10
; RV64ZVE32F-NEXT: .LBB84_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa6, 0(a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB84_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB84_12
; RV64ZVE32F-NEXT: .LBB84_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -9519,66 +9497,68 @@ define void @mscatter_baseidx_sext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa1, 0(a2)
; RV64ZVE32F-NEXT: .LBB85_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB85_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB85_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: slli a2, a2, 3
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
+; RV64ZVE32F-NEXT: .LBB85_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB85_13
-; RV64ZVE32F-NEXT: .LBB85_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
; RV64ZVE32F-NEXT: bnez a2, .LBB85_14
-; RV64ZVE32F-NEXT: .LBB85_7: # %else8
+; RV64ZVE32F-NEXT: .LBB85_8: # %else8
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: beqz a2, .LBB85_9
-; RV64ZVE32F-NEXT: .LBB85_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: beqz a2, .LBB85_10
+; RV64ZVE32F-NEXT: .LBB85_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa5, 0(a2)
-; RV64ZVE32F-NEXT: .LBB85_9: # %else10
+; RV64ZVE32F-NEXT: .LBB85_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a2, .LBB85_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a1, a1, -128
; RV64ZVE32F-NEXT: bnez a1, .LBB85_16
-; RV64ZVE32F-NEXT: .LBB85_11: # %else14
+; RV64ZVE32F-NEXT: .LBB85_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB85_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
-; RV64ZVE32F-NEXT: slli a2, a2, 3
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: fsd fa2, 0(a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB85_6
; RV64ZVE32F-NEXT: .LBB85_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa3, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB85_7
+; RV64ZVE32F-NEXT: beqz a2, .LBB85_8
; RV64ZVE32F-NEXT: .LBB85_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa4, 0(a2)
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: bnez a2, .LBB85_8
-; RV64ZVE32F-NEXT: j .LBB85_9
+; RV64ZVE32F-NEXT: bnez a2, .LBB85_9
+; RV64ZVE32F-NEXT: j .LBB85_10
; RV64ZVE32F-NEXT: .LBB85_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
; RV64ZVE32F-NEXT: slli a2, a2, 3
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: fsd fa6, 0(a2)
; RV64ZVE32F-NEXT: andi a1, a1, -128
-; RV64ZVE32F-NEXT: beqz a1, .LBB85_11
+; RV64ZVE32F-NEXT: beqz a1, .LBB85_12
; RV64ZVE32F-NEXT: .LBB85_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v8
@@ -9725,63 +9705,65 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: fsd fa1, 0(a3)
; RV64ZVE32F-NEXT: .LBB86_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 4
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
-; RV64ZVE32F-NEXT: bnez a3, .LBB86_12
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 2
+; RV64ZVE32F-NEXT: beqz a3, .LBB86_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: and a3, a3, a1
+; RV64ZVE32F-NEXT: slli a3, a3, 3
+; RV64ZVE32F-NEXT: add a3, a0, a3
+; RV64ZVE32F-NEXT: fsd fa2, 0(a3)
+; RV64ZVE32F-NEXT: .LBB86_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e16, m1, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 4
; RV64ZVE32F-NEXT: bnez a3, .LBB86_13
-; RV64ZVE32F-NEXT: .LBB86_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a3, a2, 16
; RV64ZVE32F-NEXT: bnez a3, .LBB86_14
-; RV64ZVE32F-NEXT: .LBB86_7: # %else8
+; RV64ZVE32F-NEXT: .LBB86_8: # %else8
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: beqz a3, .LBB86_9
-; RV64ZVE32F-NEXT: .LBB86_8: # %cond.store9
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: beqz a3, .LBB86_10
+; RV64ZVE32F-NEXT: .LBB86_9: # %cond.store9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: fsd fa5, 0(a3)
-; RV64ZVE32F-NEXT: .LBB86_9: # %else10
+; RV64ZVE32F-NEXT: .LBB86_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 2, e16, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a3, a2, 64
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v9, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 2
; RV64ZVE32F-NEXT: bnez a3, .LBB86_15
-; RV64ZVE32F-NEXT: # %bb.10: # %else12
+; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a2, a2, -128
; RV64ZVE32F-NEXT: bnez a2, .LBB86_16
-; RV64ZVE32F-NEXT: .LBB86_11: # %else14
+; RV64ZVE32F-NEXT: .LBB86_12: # %else14
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB86_12: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
-; RV64ZVE32F-NEXT: and a3, a3, a1
-; RV64ZVE32F-NEXT: slli a3, a3, 3
-; RV64ZVE32F-NEXT: add a3, a0, a3
-; RV64ZVE32F-NEXT: fsd fa2, 0(a3)
-; RV64ZVE32F-NEXT: andi a3, a2, 8
-; RV64ZVE32F-NEXT: beqz a3, .LBB86_6
; RV64ZVE32F-NEXT: .LBB86_13: # %cond.store5
-; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
-; RV64ZVE32F-NEXT: vmv.x.s a3, v8
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a3, v9
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: fsd fa3, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 16
-; RV64ZVE32F-NEXT: beqz a3, .LBB86_7
+; RV64ZVE32F-NEXT: beqz a3, .LBB86_8
; RV64ZVE32F-NEXT: .LBB86_14: # %cond.store7
-; RV64ZVE32F-NEXT: vmv.x.s a3, v9
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a3, a3, a1
; RV64ZVE32F-NEXT: slli a3, a3, 3
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: fsd fa4, 0(a3)
; RV64ZVE32F-NEXT: andi a3, a2, 32
-; RV64ZVE32F-NEXT: bnez a3, .LBB86_8
-; RV64ZVE32F-NEXT: j .LBB86_9
+; RV64ZVE32F-NEXT: bnez a3, .LBB86_9
+; RV64ZVE32F-NEXT: j .LBB86_10
; RV64ZVE32F-NEXT: .LBB86_15: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a3, v8
; RV64ZVE32F-NEXT: and a3, a3, a1
@@ -9789,7 +9771,7 @@ define void @mscatter_baseidx_zext_v8i16_v8f64(<8 x double> %val, ptr %base, <8
; RV64ZVE32F-NEXT: add a3, a0, a3
; RV64ZVE32F-NEXT: fsd fa6, 0(a3)
; RV64ZVE32F-NEXT: andi a2, a2, -128
-; RV64ZVE32F-NEXT: beqz a2, .LBB86_11
+; RV64ZVE32F-NEXT: beqz a2, .LBB86_12
; RV64ZVE32F-NEXT: .LBB86_16: # %cond.store13
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v8
@@ -10663,30 +10645,28 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 1
; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB91_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: vslidedown.vi v11, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB91_25
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: beqz a2, .LBB91_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 2
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB91_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB91_26
-; RV64ZVE32F-NEXT: .LBB91_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB91_8
-; RV64ZVE32F-NEXT: .LBB91_7: # %cond.store7
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4
-; RV64ZVE32F-NEXT: vse8.v v11, (a2)
+; RV64ZVE32F-NEXT: bnez a2, .LBB91_27
; RV64ZVE32F-NEXT: .LBB91_8: # %else8
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB91_10
-; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9
+; RV64ZVE32F-NEXT: .LBB91_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
@@ -10695,16 +10675,18 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 5
; RV64ZVE32F-NEXT: vse8.v v11, (a2)
; RV64ZVE32F-NEXT: .LBB91_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB91_27
+; RV64ZVE32F-NEXT: bnez a2, .LBB91_28
; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a2, a1, 128
-; RV64ZVE32F-NEXT: bnez a2, .LBB91_28
+; RV64ZVE32F-NEXT: bnez a2, .LBB91_29
; RV64ZVE32F-NEXT: .LBB91_12: # %else14
; RV64ZVE32F-NEXT: andi a2, a1, 256
-; RV64ZVE32F-NEXT: bnez a2, .LBB91_29
+; RV64ZVE32F-NEXT: bnez a2, .LBB91_30
; RV64ZVE32F-NEXT: .LBB91_13: # %else16
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: beqz a2, .LBB91_15
@@ -10717,45 +10699,51 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 9
; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: .LBB91_15: # %else18
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 1024
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB91_30
-; RV64ZVE32F-NEXT: # %bb.16: # %else20
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 2
+; RV64ZVE32F-NEXT: beqz a2, .LBB91_17
+; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10
+; RV64ZVE32F-NEXT: vse8.v v11, (a2)
+; RV64ZVE32F-NEXT: .LBB91_17: # %else20
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 52
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 4
; RV64ZVE32F-NEXT: bltz a2, .LBB91_31
-; RV64ZVE32F-NEXT: .LBB91_17: # %else22
+; RV64ZVE32F-NEXT: # %bb.18: # %else22
; RV64ZVE32F-NEXT: slli a2, a1, 51
; RV64ZVE32F-NEXT: bltz a2, .LBB91_32
-; RV64ZVE32F-NEXT: .LBB91_18: # %else24
+; RV64ZVE32F-NEXT: .LBB91_19: # %else24
; RV64ZVE32F-NEXT: slli a2, a1, 50
-; RV64ZVE32F-NEXT: bgez a2, .LBB91_20
-; RV64ZVE32F-NEXT: .LBB91_19: # %cond.store25
+; RV64ZVE32F-NEXT: bgez a2, .LBB91_21
+; RV64ZVE32F-NEXT: .LBB91_20: # %cond.store25
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v9, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 13
-; RV64ZVE32F-NEXT: vse8.v v9, (a2)
-; RV64ZVE32F-NEXT: .LBB91_20: # %else26
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 13
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
+; RV64ZVE32F-NEXT: .LBB91_21: # %else26
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 49
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v10, 2
-; RV64ZVE32F-NEXT: bgez a2, .LBB91_22
-; RV64ZVE32F-NEXT: # %bb.21: # %cond.store27
+; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 2
+; RV64ZVE32F-NEXT: bgez a2, .LBB91_23
+; RV64ZVE32F-NEXT: # %bb.22: # %cond.store27
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 14
; RV64ZVE32F-NEXT: vse8.v v10, (a2)
-; RV64ZVE32F-NEXT: .LBB91_22: # %else28
+; RV64ZVE32F-NEXT: .LBB91_23: # %else28
; RV64ZVE32F-NEXT: lui a2, 1048568
; RV64ZVE32F-NEXT: and a1, a1, a2
-; RV64ZVE32F-NEXT: beqz a1, .LBB91_24
-; RV64ZVE32F-NEXT: # %bb.23: # %cond.store29
+; RV64ZVE32F-NEXT: beqz a1, .LBB91_25
+; RV64ZVE32F-NEXT: # %bb.24: # %cond.store29
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v9
@@ -10763,16 +10751,8 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 15
; RV64ZVE32F-NEXT: vse8.v v8, (a0)
-; RV64ZVE32F-NEXT: .LBB91_24: # %else30
+; RV64ZVE32F-NEXT: .LBB91_25: # %else30
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB91_25: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 2
-; RV64ZVE32F-NEXT: vse8.v v12, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB91_6
; RV64ZVE32F-NEXT: .LBB91_26: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
@@ -10782,9 +10762,17 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 3
; RV64ZVE32F-NEXT: vse8.v v11, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: bnez a2, .LBB91_7
-; RV64ZVE32F-NEXT: j .LBB91_8
-; RV64ZVE32F-NEXT: .LBB91_27: # %cond.store11
+; RV64ZVE32F-NEXT: beqz a2, .LBB91_8
+; RV64ZVE32F-NEXT: .LBB91_27: # %cond.store7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 4
+; RV64ZVE32F-NEXT: vse8.v v11, (a2)
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: bnez a2, .LBB91_9
+; RV64ZVE32F-NEXT: j .LBB91_10
+; RV64ZVE32F-NEXT: .LBB91_28: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
@@ -10792,7 +10780,7 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v11, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 128
; RV64ZVE32F-NEXT: beqz a2, .LBB91_12
-; RV64ZVE32F-NEXT: .LBB91_28: # %cond.store13
+; RV64ZVE32F-NEXT: .LBB91_29: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
@@ -10802,7 +10790,7 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 256
; RV64ZVE32F-NEXT: beqz a2, .LBB91_13
-; RV64ZVE32F-NEXT: .LBB91_29: # %cond.store15
+; RV64ZVE32F-NEXT: .LBB91_30: # %cond.store15
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -10811,33 +10799,25 @@ define void @mscatter_baseidx_v16i8(<16 x i8> %val, ptr %base, <16 x i8> %idxs,
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: bnez a2, .LBB91_14
; RV64ZVE32F-NEXT: j .LBB91_15
-; RV64ZVE32F-NEXT: .LBB91_30: # %cond.store19
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v8, 10
-; RV64ZVE32F-NEXT: vse8.v v11, (a2)
-; RV64ZVE32F-NEXT: slli a2, a1, 52
-; RV64ZVE32F-NEXT: bgez a2, .LBB91_17
; RV64ZVE32F-NEXT: .LBB91_31: # %cond.store21
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v9, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v9
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 11
-; RV64ZVE32F-NEXT: vse8.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 11
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 51
-; RV64ZVE32F-NEXT: bgez a2, .LBB91_18
+; RV64ZVE32F-NEXT: bgez a2, .LBB91_19
; RV64ZVE32F-NEXT: .LBB91_32: # %cond.store23
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m1, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vmv.x.s a2, v9
; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v9, v8, 12
-; RV64ZVE32F-NEXT: vse8.v v9, (a2)
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v8, 12
+; RV64ZVE32F-NEXT: vse8.v v10, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 50
-; RV64ZVE32F-NEXT: bltz a2, .LBB91_19
-; RV64ZVE32F-NEXT: j .LBB91_20
+; RV64ZVE32F-NEXT: bltz a2, .LBB91_20
+; RV64ZVE32F-NEXT: j .LBB91_21
%ptrs = getelementptr inbounds i8, ptr %base, <16 x i8> %idxs
call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %val, <16 x ptr> %ptrs, i32 1, <16 x i1> %m)
ret void
@@ -10896,48 +10876,48 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 1
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB92_4: # %else2
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 4
; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB92_49
-; RV64ZVE32F-NEXT: # %bb.5: # %else4
+; RV64ZVE32F-NEXT: beqz a2, .LBB92_6
+; RV64ZVE32F-NEXT: # %bb.5: # %cond.store3
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2
+; RV64ZVE32F-NEXT: vse8.v v14, (a2)
+; RV64ZVE32F-NEXT: .LBB92_6: # %else4
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 8
+; RV64ZVE32F-NEXT: vslidedown.vi v13, v10, 4
; RV64ZVE32F-NEXT: bnez a2, .LBB92_50
-; RV64ZVE32F-NEXT: .LBB92_6: # %else6
+; RV64ZVE32F-NEXT: # %bb.7: # %else6
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: beqz a2, .LBB92_8
-; RV64ZVE32F-NEXT: .LBB92_7: # %cond.store7
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v13
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4
-; RV64ZVE32F-NEXT: vse8.v v14, (a2)
+; RV64ZVE32F-NEXT: bnez a2, .LBB92_51
; RV64ZVE32F-NEXT: .LBB92_8: # %else8
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 32
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8
; RV64ZVE32F-NEXT: beqz a2, .LBB92_10
-; RV64ZVE32F-NEXT: # %bb.9: # %cond.store9
+; RV64ZVE32F-NEXT: .LBB92_9: # %cond.store9
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v14, v13, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v14
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v13, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 5
; RV64ZVE32F-NEXT: vse8.v v14, (a2)
; RV64ZVE32F-NEXT: .LBB92_10: # %else10
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 64
; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 2
-; RV64ZVE32F-NEXT: bnez a2, .LBB92_51
+; RV64ZVE32F-NEXT: bnez a2, .LBB92_52
; RV64ZVE32F-NEXT: # %bb.11: # %else12
; RV64ZVE32F-NEXT: andi a2, a1, 128
-; RV64ZVE32F-NEXT: bnez a2, .LBB92_52
+; RV64ZVE32F-NEXT: bnez a2, .LBB92_53
; RV64ZVE32F-NEXT: .LBB92_12: # %else14
; RV64ZVE32F-NEXT: andi a2, a1, 256
-; RV64ZVE32F-NEXT: bnez a2, .LBB92_53
+; RV64ZVE32F-NEXT: bnez a2, .LBB92_54
; RV64ZVE32F-NEXT: .LBB92_13: # %else16
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: beqz a2, .LBB92_15
@@ -10950,25 +10930,25 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 9
; RV64ZVE32F-NEXT: vse8.v v14, (a2)
; RV64ZVE32F-NEXT: .LBB92_15: # %else18
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: andi a2, a1, 1024
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 2
+; RV64ZVE32F-NEXT: vslidedown.vi v13, v12, 2
; RV64ZVE32F-NEXT: beqz a2, .LBB92_17
; RV64ZVE32F-NEXT: # %bb.16: # %cond.store19
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 10
; RV64ZVE32F-NEXT: vse8.v v14, (a2)
; RV64ZVE32F-NEXT: .LBB92_17: # %else20
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 52
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 4
; RV64ZVE32F-NEXT: bgez a2, .LBB92_19
; RV64ZVE32F-NEXT: # %bb.18: # %cond.store21
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 11
@@ -10979,7 +10959,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 16
; RV64ZVE32F-NEXT: bgez a2, .LBB92_21
; RV64ZVE32F-NEXT: # %bb.20: # %cond.store23
-; RV64ZVE32F-NEXT: vmv.x.s a2, v13
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 12
@@ -10989,7 +10969,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: bgez a2, .LBB92_23
; RV64ZVE32F-NEXT: # %bb.22: # %cond.store25
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 1
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
@@ -10998,14 +10978,14 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: .LBB92_23: # %else26
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 49
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v13, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_54
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v12, 2
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_55
; RV64ZVE32F-NEXT: # %bb.24: # %else28
; RV64ZVE32F-NEXT: slli a2, a1, 48
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_55
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_56
; RV64ZVE32F-NEXT: .LBB92_25: # %else30
; RV64ZVE32F-NEXT: slli a2, a1, 47
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_56
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_57
; RV64ZVE32F-NEXT: .LBB92_26: # %else32
; RV64ZVE32F-NEXT: slli a2, a1, 46
; RV64ZVE32F-NEXT: bgez a2, .LBB92_28
@@ -11018,30 +10998,28 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 17
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB92_28: # %else34
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 45
; RV64ZVE32F-NEXT: vslidedown.vi v12, v10, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_57
-; RV64ZVE32F-NEXT: # %bb.29: # %else36
+; RV64ZVE32F-NEXT: bgez a2, .LBB92_30
+; RV64ZVE32F-NEXT: # %bb.29: # %cond.store35
+; RV64ZVE32F-NEXT: vmv.x.s a2, v12
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18
+; RV64ZVE32F-NEXT: vse8.v v14, (a2)
+; RV64ZVE32F-NEXT: .LBB92_30: # %else36
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 44
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: bltz a2, .LBB92_58
-; RV64ZVE32F-NEXT: .LBB92_30: # %else38
+; RV64ZVE32F-NEXT: # %bb.31: # %else38
; RV64ZVE32F-NEXT: slli a2, a1, 43
-; RV64ZVE32F-NEXT: bgez a2, .LBB92_32
-; RV64ZVE32F-NEXT: .LBB92_31: # %cond.store39
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 20
-; RV64ZVE32F-NEXT: vse8.v v12, (a2)
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_59
; RV64ZVE32F-NEXT: .LBB92_32: # %else40
-; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 42
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8
; RV64ZVE32F-NEXT: bgez a2, .LBB92_34
-; RV64ZVE32F-NEXT: # %bb.33: # %cond.store41
+; RV64ZVE32F-NEXT: .LBB92_33: # %cond.store41
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v11, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v12
@@ -11050,16 +11028,18 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 21
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB92_34: # %else42
+; RV64ZVE32F-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 8
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 41
; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_59
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_60
; RV64ZVE32F-NEXT: # %bb.35: # %else44
; RV64ZVE32F-NEXT: slli a2, a1, 40
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_60
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_61
; RV64ZVE32F-NEXT: .LBB92_36: # %else46
; RV64ZVE32F-NEXT: slli a2, a1, 39
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_61
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_62
; RV64ZVE32F-NEXT: .LBB92_37: # %else48
; RV64ZVE32F-NEXT: slli a2, a1, 38
; RV64ZVE32F-NEXT: bgez a2, .LBB92_39
@@ -11072,45 +11052,51 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 25
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: .LBB92_39: # %else50
-; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 4
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 37
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_62
-; RV64ZVE32F-NEXT: # %bb.40: # %else52
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 2
+; RV64ZVE32F-NEXT: bgez a2, .LBB92_41
+; RV64ZVE32F-NEXT: # %bb.40: # %cond.store51
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26
+; RV64ZVE32F-NEXT: vse8.v v12, (a2)
+; RV64ZVE32F-NEXT: .LBB92_41: # %else52
+; RV64ZVE32F-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 36
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 4
; RV64ZVE32F-NEXT: bltz a2, .LBB92_63
-; RV64ZVE32F-NEXT: .LBB92_41: # %else54
+; RV64ZVE32F-NEXT: # %bb.42: # %else54
; RV64ZVE32F-NEXT: slli a2, a1, 35
; RV64ZVE32F-NEXT: bltz a2, .LBB92_64
-; RV64ZVE32F-NEXT: .LBB92_42: # %else56
+; RV64ZVE32F-NEXT: .LBB92_43: # %else56
; RV64ZVE32F-NEXT: slli a2, a1, 34
-; RV64ZVE32F-NEXT: bgez a2, .LBB92_44
-; RV64ZVE32F-NEXT: .LBB92_43: # %cond.store57
+; RV64ZVE32F-NEXT: bgez a2, .LBB92_45
+; RV64ZVE32F-NEXT: .LBB92_44: # %cond.store57
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v10, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 29
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB92_44: # %else58
+; RV64ZVE32F-NEXT: .LBB92_45: # %else58
; RV64ZVE32F-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: slli a2, a1, 33
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v11, 2
-; RV64ZVE32F-NEXT: bgez a2, .LBB92_46
-; RV64ZVE32F-NEXT: # %bb.45: # %cond.store59
+; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 2
+; RV64ZVE32F-NEXT: bgez a2, .LBB92_47
+; RV64ZVE32F-NEXT: # %bb.46: # %cond.store59
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 30
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
-; RV64ZVE32F-NEXT: .LBB92_46: # %else60
+; RV64ZVE32F-NEXT: .LBB92_47: # %else60
; RV64ZVE32F-NEXT: lui a2, 524288
; RV64ZVE32F-NEXT: and a1, a1, a2
-; RV64ZVE32F-NEXT: beqz a1, .LBB92_48
-; RV64ZVE32F-NEXT: # %bb.47: # %cond.store61
+; RV64ZVE32F-NEXT: beqz a1, .LBB92_49
+; RV64ZVE32F-NEXT: # %bb.48: # %cond.store61
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
; RV64ZVE32F-NEXT: vmv.x.s a1, v10
@@ -11118,16 +11104,8 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v8, v8, 31
; RV64ZVE32F-NEXT: vse8.v v8, (a0)
-; RV64ZVE32F-NEXT: .LBB92_48: # %else62
+; RV64ZVE32F-NEXT: .LBB92_49: # %else62
; RV64ZVE32F-NEXT: ret
-; RV64ZVE32F-NEXT: .LBB92_49: # %cond.store3
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 2
-; RV64ZVE32F-NEXT: vse8.v v14, (a2)
-; RV64ZVE32F-NEXT: andi a2, a1, 8
-; RV64ZVE32F-NEXT: beqz a2, .LBB92_6
; RV64ZVE32F-NEXT: .LBB92_50: # %cond.store5
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1
@@ -11137,9 +11115,17 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 3
; RV64ZVE32F-NEXT: vse8.v v14, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 16
-; RV64ZVE32F-NEXT: bnez a2, .LBB92_7
-; RV64ZVE32F-NEXT: j .LBB92_8
-; RV64ZVE32F-NEXT: .LBB92_51: # %cond.store11
+; RV64ZVE32F-NEXT: beqz a2, .LBB92_8
+; RV64ZVE32F-NEXT: .LBB92_51: # %cond.store7
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v13
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 4
+; RV64ZVE32F-NEXT: vse8.v v14, (a2)
+; RV64ZVE32F-NEXT: andi a2, a1, 32
+; RV64ZVE32F-NEXT: bnez a2, .LBB92_9
+; RV64ZVE32F-NEXT: j .LBB92_10
+; RV64ZVE32F-NEXT: .LBB92_52: # %cond.store11
; RV64ZVE32F-NEXT: vmv.x.s a2, v13
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
@@ -11147,7 +11133,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v14, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 128
; RV64ZVE32F-NEXT: beqz a2, .LBB92_12
-; RV64ZVE32F-NEXT: .LBB92_52: # %cond.store13
+; RV64ZVE32F-NEXT: .LBB92_53: # %cond.store13
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v13, v13, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v13
@@ -11157,7 +11143,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v14, (a2)
; RV64ZVE32F-NEXT: andi a2, a1, 256
; RV64ZVE32F-NEXT: beqz a2, .LBB92_13
-; RV64ZVE32F-NEXT: .LBB92_53: # %cond.store15
+; RV64ZVE32F-NEXT: .LBB92_54: # %cond.store15
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v12
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -11166,7 +11152,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: andi a2, a1, 512
; RV64ZVE32F-NEXT: bnez a2, .LBB92_14
; RV64ZVE32F-NEXT: j .LBB92_15
-; RV64ZVE32F-NEXT: .LBB92_54: # %cond.store27
+; RV64ZVE32F-NEXT: .LBB92_55: # %cond.store27
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
@@ -11174,7 +11160,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 48
; RV64ZVE32F-NEXT: bgez a2, .LBB92_25
-; RV64ZVE32F-NEXT: .LBB92_55: # %cond.store29
+; RV64ZVE32F-NEXT: .LBB92_56: # %cond.store29
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
@@ -11184,7 +11170,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 47
; RV64ZVE32F-NEXT: bgez a2, .LBB92_26
-; RV64ZVE32F-NEXT: .LBB92_56: # %cond.store31
+; RV64ZVE32F-NEXT: .LBB92_57: # %cond.store31
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -11193,14 +11179,6 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: slli a2, a1, 46
; RV64ZVE32F-NEXT: bltz a2, .LBB92_27
; RV64ZVE32F-NEXT: j .LBB92_28
-; RV64ZVE32F-NEXT: .LBB92_57: # %cond.store35
-; RV64ZVE32F-NEXT: vmv.x.s a2, v12
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v14, v8, 18
-; RV64ZVE32F-NEXT: vse8.v v14, (a2)
-; RV64ZVE32F-NEXT: slli a2, a1, 44
-; RV64ZVE32F-NEXT: bgez a2, .LBB92_30
; RV64ZVE32F-NEXT: .LBB92_58: # %cond.store37
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v12, 1
@@ -11210,9 +11188,17 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 19
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 43
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_31
-; RV64ZVE32F-NEXT: j .LBB92_32
-; RV64ZVE32F-NEXT: .LBB92_59: # %cond.store43
+; RV64ZVE32F-NEXT: bgez a2, .LBB92_32
+; RV64ZVE32F-NEXT: .LBB92_59: # %cond.store39
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: add a2, a0, a2
+; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 20
+; RV64ZVE32F-NEXT: vse8.v v12, (a2)
+; RV64ZVE32F-NEXT: slli a2, a1, 42
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_33
+; RV64ZVE32F-NEXT: j .LBB92_34
+; RV64ZVE32F-NEXT: .LBB92_60: # %cond.store43
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
@@ -11220,7 +11206,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 40
; RV64ZVE32F-NEXT: bgez a2, .LBB92_36
-; RV64ZVE32F-NEXT: .LBB92_60: # %cond.store45
+; RV64ZVE32F-NEXT: .LBB92_61: # %cond.store45
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
; RV64ZVE32F-NEXT: vmv.x.s a2, v11
@@ -11230,7 +11216,7 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 39
; RV64ZVE32F-NEXT: bgez a2, .LBB92_37
-; RV64ZVE32F-NEXT: .LBB92_61: # %cond.store47
+; RV64ZVE32F-NEXT: .LBB92_62: # %cond.store47
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
@@ -11239,33 +11225,25 @@ define void @mscatter_baseidx_v32i8(<32 x i8> %val, ptr %base, <32 x i8> %idxs,
; RV64ZVE32F-NEXT: slli a2, a1, 38
; RV64ZVE32F-NEXT: bltz a2, .LBB92_38
; RV64ZVE32F-NEXT: j .LBB92_39
-; RV64ZVE32F-NEXT: .LBB92_62: # %cond.store51
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
-; RV64ZVE32F-NEXT: add a2, a0, a2
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 26
-; RV64ZVE32F-NEXT: vse8.v v12, (a2)
-; RV64ZVE32F-NEXT: slli a2, a1, 36
-; RV64ZVE32F-NEXT: bgez a2, .LBB92_41
; RV64ZVE32F-NEXT: .LBB92_63: # %cond.store53
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma
-; RV64ZVE32F-NEXT: vslidedown.vi v10, v10, 1
-; RV64ZVE32F-NEXT: vmv.x.s a2, v10
+; RV64ZVE32F-NEXT: vslidedown.vi v11, v11, 1
+; RV64ZVE32F-NEXT: vmv.x.s a2, v11
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 27
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 35
-; RV64ZVE32F-NEXT: bgez a2, .LBB92_42
+; RV64ZVE32F-NEXT: bgez a2, .LBB92_43
; RV64ZVE32F-NEXT: .LBB92_64: # %cond.store55
; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, m2, ta, ma
-; RV64ZVE32F-NEXT: vmv.x.s a2, v11
+; RV64ZVE32F-NEXT: vmv.x.s a2, v10
; RV64ZVE32F-NEXT: add a2, a0, a2
; RV64ZVE32F-NEXT: vslidedown.vi v12, v8, 28
; RV64ZVE32F-NEXT: vse8.v v12, (a2)
; RV64ZVE32F-NEXT: slli a2, a1, 34
-; RV64ZVE32F-NEXT: bltz a2, .LBB92_43
-; RV64ZVE32F-NEXT: j .LBB92_44
+; RV64ZVE32F-NEXT: bltz a2, .LBB92_44
+; RV64ZVE32F-NEXT: j .LBB92_45
%ptrs = getelementptr inbounds i8, ptr %base, <32 x i8> %idxs
call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %val, <32 x ptr> %ptrs, i32 1, <32 x i1> %m)
ret void
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
index 73312bbaa415a2..a772f4d466ccbb 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-fp-vp.ll
@@ -541,6 +541,57 @@ define <8 x i1> @fcmp_uno_vf_swap_v8f16(<8 x half> %va, half %b, <8 x i1> %m, i3
declare <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half>, <128 x half>, metadata, <128 x i1>, i32)
define <128 x i1> @fcmp_oeq_vv_v128f16(<128 x half> %va, <128 x half> %vb, <128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fcmp_oeq_vv_v128f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: li a1, 64
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma
+; CHECK-NEXT: addi a0, a2, -64
+; CHECK-NEXT: sltu a3, a2, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: vslidedown.vi v0, v0, 8
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t
+; CHECK-NEXT: bltu a2, a1, .LBB43_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 64
+; CHECK-NEXT: .LBB43_2:
+; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v1, 8
+; CHECK-NEXT: vmv.v.v v0, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%v = call <128 x i1> @llvm.vp.fcmp.v128f16(<128 x half> %va, <128 x half> %vb, metadata !"oeq", <128 x i1> %m, i32 %evl)
ret <128 x i1> %v
}
@@ -1109,45 +1160,44 @@ define <32 x i1> @fcmp_oeq_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; CHECK-NEXT: addi a1, a0, 128
-; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a1)
+; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
-; CHECK-NEXT: vslidedown.vi v1, v0, 2
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; CHECK-NEXT: vle64.v v24, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: li a1, 16
-; CHECK-NEXT: mv a0, a2
-; CHECK-NEXT: bltu a2, a1, .LBB87_2
-; CHECK-NEXT: # %bb.1:
-; CHECK-NEXT: li a0, 16
-; CHECK-NEXT: .LBB87_2:
-; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v2, v8, v24, v0.t
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: vle64.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT: addi a0, a2, -16
; CHECK-NEXT: sltu a1, a2, a0
; CHECK-NEXT: addi a1, a1, -1
; CHECK-NEXT: and a0, a1, a0
+; CHECK-NEXT: vslidedown.vi v0, v0, 2
; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
-; CHECK-NEXT: vmv1r.v v0, v1
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmfeq.vv v1, v16, v8, v0.t
+; CHECK-NEXT: bltu a2, a0, .LBB87_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 16
+; CHECK-NEXT: .LBB87_2:
+; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vmfeq.vv v24, v16, v8, v0.t
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmfeq.vv v16, v8, v24, v0.t
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT: vslideup.vi v2, v24, 2
-; CHECK-NEXT: vmv1r.v v0, v2
+; CHECK-NEXT: vslideup.vi v16, v1, 2
+; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
index dad260e1fac592..fa91394de8ba44 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-setcc-int-vp.ll
@@ -1315,109 +1315,57 @@ define <8 x i1> @icmp_sle_vi_swap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext
declare <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32>, <64 x i32>, metadata, <64 x i1>, i32)
define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m, i32 zeroext %evl) {
-; RV32-LABEL: icmp_eq_vv_v64i32:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -16
-; RV32-NEXT: .cfi_def_cfa_offset 16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 4
-; RV32-NEXT: sub sp, sp, a1
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV32-NEXT: addi a1, a0, 128
-; RV32-NEXT: li a3, 32
-; RV32-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV32-NEXT: vle32.v v24, (a1)
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 3
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 16
-; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV32-NEXT: vle32.v v24, (a0)
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV32-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV32-NEXT: vslidedown.vi v1, v0, 4
-; RV32-NEXT: mv a0, a2
-; RV32-NEXT: bltu a2, a3, .LBB99_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a0, 32
-; RV32-NEXT: .LBB99_2:
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmseq.vv v2, v8, v24, v0.t
-; RV32-NEXT: addi a0, a2, -32
-; RV32-NEXT: sltu a1, a2, a0
-; RV32-NEXT: addi a1, a1, -1
-; RV32-NEXT: and a0, a1, a0
-; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV32-NEXT: vmseq.vv v24, v16, v8, v0.t
-; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vi v2, v24, 4
-; RV32-NEXT: vmv1r.v v0, v2
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 4
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: addi sp, sp, 16
-; RV32-NEXT: ret
-;
-; RV64-LABEL: icmp_eq_vv_v64i32:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -16
-; RV64-NEXT: .cfi_def_cfa_offset 16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 4
-; RV64-NEXT: sub sp, sp, a1
-; RV64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
-; RV64-NEXT: addi a1, a0, 128
-; RV64-NEXT: li a3, 32
-; RV64-NEXT: vsetvli zero, a3, e32, m8, ta, ma
-; RV64-NEXT: vle32.v v24, (a1)
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 3
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 16
-; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
-; RV64-NEXT: vle32.v v24, (a0)
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
-; RV64-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; RV64-NEXT: mv a0, a2
-; RV64-NEXT: vslidedown.vi v1, v0, 4
-; RV64-NEXT: bltu a2, a3, .LBB99_2
-; RV64-NEXT: # %bb.1:
-; RV64-NEXT: li a0, 32
-; RV64-NEXT: .LBB99_2:
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vmseq.vv v2, v8, v24, v0.t
-; RV64-NEXT: addi a0, a2, -32
-; RV64-NEXT: sltu a1, a2, a0
-; RV64-NEXT: addi a1, a1, -1
-; RV64-NEXT: and a0, a1, a0
-; RV64-NEXT: vsetvli zero, a0, e32, m8, ta, ma
-; RV64-NEXT: vmv1r.v v0, v1
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
-; RV64-NEXT: vmseq.vv v24, v16, v8, v0.t
-; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vi v2, v24, 4
-; RV64-NEXT: vmv1r.v v0, v2
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 4
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: addi sp, sp, 16
-; RV64-NEXT: ret
+; CHECK-LABEL: icmp_eq_vv_v64i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 4
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: addi a3, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
+; CHECK-NEXT: addi a0, a0, 128
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
+; CHECK-NEXT: addi a0, a2, -32
+; CHECK-NEXT: sltu a3, a2, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: vslidedown.vi v0, v0, 4
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmseq.vv v1, v16, v8, v0.t
+; CHECK-NEXT: bltu a2, a1, .LBB99_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: .LBB99_2:
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vmseq.vv v16, v8, v24, v0.t
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v16, v1, 4
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%v = call <64 x i1> @llvm.vp.icmp.v64i32(<64 x i32> %va, <64 x i32> %vb, metadata !"eq", <64 x i1> %m, i32 %evl)
ret <64 x i1> %v
}
@@ -1425,26 +1373,26 @@ define <64 x i1> @icmp_eq_vv_v64i32(<64 x i32> %va, <64 x i32> %vb, <64 x i1> %m
define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: icmp_eq_vx_v64i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: li a3, 32
-; CHECK-NEXT: vslidedown.vi v24, v0, 4
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: bltu a1, a3, .LBB100_2
-; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vslidedown.vi v0, v0, 4
+; CHECK-NEXT: addi a2, a1, -32
+; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB100_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 32
; CHECK-NEXT: .LBB100_2:
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT: addi a2, a1, -32
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t
+; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v25, v8, 4
-; CHECK-NEXT: vmv1r.v v0, v25
+; CHECK-NEXT: vslideup.vi v16, v25, 4
+; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: ret
%elt.head = insertelement <64 x i32> poison, i32 %b, i32 0
%vb = shufflevector <64 x i32> %elt.head, <64 x i32> poison, <64 x i32> zeroinitializer
@@ -1455,26 +1403,26 @@ define <64 x i1> @icmp_eq_vx_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 ze
define <64 x i1> @icmp_eq_vx_swap_v64i32(<64 x i32> %va, i32 %b, <64 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: icmp_eq_vx_swap_v64i32:
; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma
-; CHECK-NEXT: li a3, 32
-; CHECK-NEXT: vslidedown.vi v24, v0, 4
-; CHECK-NEXT: mv a2, a1
-; CHECK-NEXT: bltu a1, a3, .LBB101_2
-; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: vslidedown.vi v0, v0, 4
+; CHECK-NEXT: addi a2, a1, -32
+; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT: li a2, 32
+; CHECK-NEXT: vmseq.vx v25, v16, a0, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB101_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 32
; CHECK-NEXT: .LBB101_2:
-; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vmseq.vx v25, v8, a0, v0.t
-; CHECK-NEXT: addi a2, a1, -32
-; CHECK-NEXT: sltu a1, a1, a2
-; CHECK-NEXT: addi a1, a1, -1
-; CHECK-NEXT: and a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT: vmv1r.v v0, v24
-; CHECK-NEXT: vmseq.vx v8, v16, a0, v0.t
+; CHECK-NEXT: vmseq.vx v16, v8, a0, v0.t
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vi v25, v8, 4
-; CHECK-NEXT: vmv1r.v v0, v25
+; CHECK-NEXT: vslideup.vi v16, v25, 4
+; CHECK-NEXT: vmv1r.v v0, v16
; CHECK-NEXT: ret
%elt.head = insertelement <64 x i32> poison, i32 %b, i32 0
%vb = shufflevector <64 x i32> %elt.head, <64 x i32> poison, <64 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
index 84e51be5cf2869..3826f85518af9c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwadd.ll
@@ -91,37 +91,26 @@ define <64 x float> @vfwadd_v64f16(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vfwadd.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwadd.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwadd.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -203,36 +192,24 @@ define <32 x double> @vfwadd_v32f32(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vfwadd.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwadd.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwadd.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -398,10 +375,10 @@ define <32 x double> @vfwadd_vf_v32f32(ptr %x, float %y) {
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vfmv.v.f v16, fa0
-; CHECK-NEXT: vfwcvt.f.f.v v8, v16
-; CHECK-NEXT: vfwadd.wv v16, v8, v0
-; CHECK-NEXT: vfwadd.wv v8, v8, v24
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vfwcvt.f.f.v v16, v8
+; CHECK-NEXT: vfwadd.wv v8, v16, v24
+; CHECK-NEXT: vfwadd.wv v16, v16, v0
; CHECK-NEXT: ret
%a = load <32 x float>, ptr %x
%b = insertelement <32 x float> poison, float %y, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
index 5093c22ba246ea..d799079efe628d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmul.ll
@@ -91,37 +91,26 @@ define <64 x float> @vfwmul_v64f16(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vfwmul.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwmul.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwmul.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -203,36 +192,24 @@ define <32 x double> @vfwmul_v32f32(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vfwmul.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwmul.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwmul.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -394,18 +371,17 @@ define <32 x double> @vfwmul_vf_v32f32(ptr %x, float %y) {
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v8, v16
+; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v16, 16
+; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vfwcvt.f.f.v v24, v16
-; CHECK-NEXT: vfmv.v.f v16, fa0
-; CHECK-NEXT: vfwcvt.f.f.v v0, v16
+; CHECK-NEXT: vfwcvt.f.f.v v16, v8
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vfwcvt.f.f.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vfmul.vv v8, v16, v0
; CHECK-NEXT: vfmul.vv v16, v24, v0
-; CHECK-NEXT: vfmul.vv v8, v8, v0
; CHECK-NEXT: ret
%a = load <32 x float>, ptr %x
%b = insertelement <32 x float> poison, float %y, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
index 10f1a6f13abe7c..b60f0c352c5289 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwsub.ll
@@ -91,37 +91,26 @@ define <64 x float> @vfwsub_v64f16(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vfwsub.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwsub.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwsub.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -203,36 +192,24 @@ define <32 x double> @vfwsub_v32f32(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vfwsub.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vfwsub.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vfwsub.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -394,18 +371,17 @@ define <32 x double> @vfwsub_vf_v32f32(ptr %x, float %y) {
; CHECK: # %bb.0:
; CHECK-NEXT: li a1, 32
; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v16, (a0)
-; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vfwcvt.f.f.v v8, v16
+; CHECK-NEXT: vle32.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v16, 16
+; CHECK-NEXT: vslidedown.vi v16, v8, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT: vfwcvt.f.f.v v24, v16
-; CHECK-NEXT: vfmv.v.f v16, fa0
-; CHECK-NEXT: vfwcvt.f.f.v v0, v16
+; CHECK-NEXT: vfwcvt.f.f.v v16, v8
+; CHECK-NEXT: vfmv.v.f v8, fa0
+; CHECK-NEXT: vfwcvt.f.f.v v0, v8
; CHECK-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; CHECK-NEXT: vfsub.vv v8, v16, v0
; CHECK-NEXT: vfsub.vv v16, v24, v0
-; CHECK-NEXT: vfsub.vv v8, v8, v0
; CHECK-NEXT: ret
%a = load <32 x float>, ptr %x
%b = insertelement <32 x float> poison, float %y, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
index 02d5fe491ea377..4451bce44a4b85 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpgather.ll
@@ -2468,32 +2468,31 @@ define <32 x double> @vpgather_baseidx_zext_v32i32_v32f64(ptr %base, <32 x i32>
define <32 x double> @vpgather_baseidx_v32f64(ptr %base, <32 x i64> %idxs, <32 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vpgather_baseidx_v32f64:
; RV32: # %bb.0:
-; RV32-NEXT: vmv1r.v v1, v0
; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT: vnsrl.wi v24, v16, 0
; RV32-NEXT: vnsrl.wi v16, v8, 0
; RV32-NEXT: li a2, 32
; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT: vslideup.vi v16, v24, 16
-; RV32-NEXT: vsll.vi v24, v16, 3
+; RV32-NEXT: li a3, 16
+; RV32-NEXT: vsll.vi v16, v16, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: bltu a1, a3, .LBB96_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: .LBB96_2:
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vluxei32.v v8, (a0), v16, v0.t
; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v24, 16
+; RV32-NEXT: vslidedown.vi v24, v16, 16
; RV32-NEXT: addi a2, a1, -16
-; RV32-NEXT: sltu a3, a1, a2
-; RV32-NEXT: addi a3, a3, -1
-; RV32-NEXT: and a2, a3, a2
+; RV32-NEXT: sltu a1, a1, a2
+; RV32-NEXT: addi a1, a1, -1
+; RV32-NEXT: and a1, a1, a2
; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
; RV32-NEXT: vslidedown.vi v0, v0, 2
-; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
-; RV32-NEXT: vluxei32.v v16, (a0), v8, v0.t
-; RV32-NEXT: li a2, 16
-; RV32-NEXT: bltu a1, a2, .LBB96_2
-; RV32-NEXT: # %bb.1:
-; RV32-NEXT: li a1, 16
-; RV32-NEXT: .LBB96_2:
; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
-; RV32-NEXT: vmv1r.v v0, v1
-; RV32-NEXT: vluxei32.v v8, (a0), v24, v0.t
+; RV32-NEXT: vluxei32.v v16, (a0), v24, v0.t
; RV32-NEXT: ret
;
; RV64-LABEL: vpgather_baseidx_v32f64:
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
index f30fc791d107ff..0b219a2ef22ed7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll
@@ -250,36 +250,25 @@ define <128 x i16> @vwadd_v128i16(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwadd.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwadd.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwadd.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -296,36 +285,25 @@ define <64 x i32> @vwadd_v64i32(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwadd.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwadd.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwadd.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -342,35 +320,23 @@ define <32 x i64> @vwadd_v32i64(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwadd.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwadd.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwadd.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
index eda443beaa4775..5d850159b95b3d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll
@@ -250,36 +250,25 @@ define <128 x i16> @vwaddu_v128i16(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwaddu.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwaddu.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwaddu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -296,36 +285,25 @@ define <64 x i32> @vwaddu_v64i32(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwaddu.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwaddu.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwaddu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -342,35 +320,23 @@ define <32 x i64> @vwaddu_v32i64(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwaddu.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwaddu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwaddu.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
index 63de84a35022cd..14f56af1bf4c61 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmul.ll
@@ -275,37 +275,26 @@ define <128 x i16> @vwmul_v128i16(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmul.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmul.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmul.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -323,37 +312,26 @@ define <64 x i32> @vwmul_v64i32(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmul.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmul.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmul.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -371,36 +349,24 @@ define <32 x i64> @vwmul_v32i64(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmul.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmul.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmul.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
index 75868977d1162d..3685bf7dbe78e5 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll
@@ -267,37 +267,26 @@ define <128 x i16> @vwmulsu_v128i16(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmulsu.vv v8, v24, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmulsu.vv v8, v0, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmulsu.vv v16, v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -315,37 +304,26 @@ define <64 x i32> @vwmulsu_v64i32(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmulsu.vv v8, v24, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmulsu.vv v8, v0, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmulsu.vv v16, v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -363,36 +341,24 @@ define <32 x i64> @vwmulsu_v32i64(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmulsu.vv v8, v24, v16
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmulsu.vv v16, v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmulsu.vv v8, v0, v16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
index b2ce75d122e253..5d45668968f305 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulu.ll
@@ -251,37 +251,26 @@ define <128 x i16> @vwmulu_v128i16(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmulu.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmulu.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmulu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -299,37 +288,26 @@ define <64 x i32> @vwmulu_v64i32(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmulu.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmulu.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmulu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -347,36 +325,24 @@ define <32 x i64> @vwmulu_v32i64(ptr %x, ptr %y) {
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwmulu.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwmulu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwmulu.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
index a8120b4a49bd5a..41656c454de7b6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll
@@ -250,36 +250,25 @@ define <128 x i16> @vwsub_v128i16(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwsub.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwsub.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwsub.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -296,36 +285,25 @@ define <64 x i32> @vwsub_v64i32(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwsub.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwsub.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwsub.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -342,35 +320,23 @@ define <32 x i64> @vwsub_v32i64(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwsub.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwsub.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwsub.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index 019df06366aff9..88cc94511231db 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -250,36 +250,25 @@ define <128 x i16> @vwsubu_v128i16(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 128
; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
-; CHECK-NEXT: vle8.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle8.v v0, (a1)
+; CHECK-NEXT: vle8.v v16, (a0)
+; CHECK-NEXT: vle8.v v24, (a1)
; CHECK-NEXT: li a0, 64
; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwsubu.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwsubu.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwsubu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -296,36 +285,25 @@ define <64 x i32> @vwsubu_v64i32(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 64
; CHECK-NEXT: vsetvli zero, a2, e16, m8, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle16.v v0, (a1)
+; CHECK-NEXT: vle16.v v16, (a0)
+; CHECK-NEXT: vle16.v v24, (a1)
; CHECK-NEXT: li a0, 32
; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
-; CHECK-NEXT: vslidedown.vx v16, v8, a0
-; CHECK-NEXT: vslidedown.vx v8, v0, a0
+; CHECK-NEXT: vslidedown.vx v8, v16, a0
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT: vslidedown.vx v0, v24, a0
; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwsubu.vv v8, v16, v24
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwsubu.vv v8, v16, v0
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwsubu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
@@ -342,35 +320,23 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind {
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: slli a2, a2, 3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: li a2, 32
; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vle32.v v16, (a0)
+; CHECK-NEXT: vle32.v v24, (a1)
+; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
+; CHECK-NEXT: vslidedown.vi v8, v16, 16
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: vle32.v v0, (a1)
-; CHECK-NEXT: vsetivli zero, 16, e32, m8, ta, ma
-; CHECK-NEXT: vslidedown.vi v16, v8, 16
-; CHECK-NEXT: vslidedown.vi v8, v0, 16
+; CHECK-NEXT: vslidedown.vi v0, v24, 16
; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT: vmv4r.v v24, v8
; CHECK-NEXT: vwsubu.vv v8, v16, v24
+; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT: vwsubu.vv v16, v24, v0
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: vwsubu.vv v8, v16, v0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 3
-; CHECK-NEXT: add a0, sp, a0
-; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
index 00609a17f1efc8..5b6fc2209910af 100644
--- a/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/shuffle-reverse.ll
@@ -258,16 +258,16 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; RV32-NEXT: vle16.v v16, (a0)
; RV32-NEXT: vmv2r.v v20, v10
-; RV32-NEXT: vmv2r.v v12, v8
-; RV32-NEXT: vrgather.vv v8, v12, v16
-; RV32-NEXT: vid.v v12
-; RV32-NEXT: vrsub.vi v12, v12, 15
+; RV32-NEXT: vrgather.vv v12, v8, v16
+; RV32-NEXT: vid.v v8
+; RV32-NEXT: vrsub.vi v8, v8, 15
; RV32-NEXT: lui a0, 16
; RV32-NEXT: addi a0, a0, -1
; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; RV32-NEXT: vmv.v.x v0, a0
; RV32-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; RV32-NEXT: vrgather.vv v8, v20, v12, v0.t
+; RV32-NEXT: vrgather.vv v12, v20, v8, v0.t
+; RV32-NEXT: vmv.v.v v8, v12
; RV32-NEXT: ret
;
; RV64-LABEL: v16i16_2:
@@ -278,16 +278,16 @@ define <32 x i16> @v16i16_2(<16 x i16> %a, <16 x i16> %b) {
; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, ma
; RV64-NEXT: vle16.v v16, (a0)
; RV64-NEXT: vmv2r.v v20, v10
-; RV64-NEXT: vmv2r.v v12, v8
-; RV64-NEXT: vrgather.vv v8, v12, v16
-; RV64-NEXT: vid.v v12
-; RV64-NEXT: vrsub.vi v12, v12, 15
+; RV64-NEXT: vrgather.vv v12, v8, v16
+; RV64-NEXT: vid.v v8
+; RV64-NEXT: vrsub.vi v8, v8, 15
; RV64-NEXT: lui a0, 16
; RV64-NEXT: addiw a0, a0, -1
; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
; RV64-NEXT: vmv.v.x v0, a0
; RV64-NEXT: vsetvli zero, a1, e16, m4, ta, mu
-; RV64-NEXT: vrgather.vv v8, v20, v12, v0.t
+; RV64-NEXT: vrgather.vv v12, v20, v8, v0.t
+; RV64-NEXT: vmv.v.v v8, v12
; RV64-NEXT: ret
%v32i16 = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <32 x i16> %v32i16
diff --git a/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll b/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll
index d9d6d4982d7777..fc67eec0f48a05 100644
--- a/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/splats-with-mixed-vl.ll
@@ -159,10 +159,8 @@ define <vscale x 1 x i32> @extract_vector_multiuse2(ptr %p, ptr %p2, i32 %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v8, a2
-; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x i32> poison, i32 %v, i32 0
%splat = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
@@ -177,10 +175,8 @@ define void @extract_vector_mixed1(ptr %p, ptr %p2, i32 %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v8, a2
-; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a0)
+; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: ret
@@ -200,8 +196,6 @@ define void @extract_vector_mixed2(ptr %p, ptr %p2, i32 %v) {
; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v8, a2
; CHECK-NEXT: vse32.v v8, (a0)
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v8, a2
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: ret
@@ -219,12 +213,9 @@ define void @extract_vector_mixed3(ptr %p, ptr %p2, i32 %v) {
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
; CHECK-NEXT: vmv.v.x v8, a2
-; CHECK-NEXT: vsetvli a3, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv.v.x v9, a2
-; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
; CHECK-NEXT: vse32.v v8, (a0)
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v9, (a1)
+; CHECK-NEXT: vse32.v v8, (a1)
; CHECK-NEXT: ret
%elt.head = insertelement <vscale x 1 x i32> poison, i32 %v, i32 0
%splat = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 205965f62d3d0b..0dd57e1be277bd 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -107,10 +107,10 @@ ret {<4 x i32>, <4 x i32>} %retval
define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) {
; CHECK-LABEL: vector_deinterleave_v2i64_v4i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 2
+; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t
@@ -194,10 +194,10 @@ ret {<4 x float>, <4 x float>} %retval
define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) {
; CHECK-LABEL: vector_deinterleave_v2f64_v4f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma
-; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT: vmv.v.i v0, 2
+; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
; CHECK-NEXT: vrgather.vi v9, v8, 1
; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t
diff --git a/llvm/test/CodeGen/X86/pr33349.ll b/llvm/test/CodeGen/X86/pr33349.ll
index 5addaca47c86ee..6b8d41fb9ba424 100644
--- a/llvm/test/CodeGen/X86/pr33349.ll
+++ b/llvm/test/CodeGen/X86/pr33349.ll
@@ -10,20 +10,20 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL: # %bb.0: # %bb
; KNL-NEXT: vpslld $31, %xmm0, %xmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k1
+; KNL-NEXT: kmovw %k1, %eax
; KNL-NEXT: kshiftrw $2, %k0, %k1
; KNL-NEXT: kshiftrw $1, %k1, %k2
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k1, %ecx
; KNL-NEXT: testb $1, %al
; KNL-NEXT: fld1
; KNL-NEXT: fldz
; KNL-NEXT: fld %st(0)
; KNL-NEXT: fcmovne %st(2), %st
-; KNL-NEXT: kmovw %k2, %eax
-; KNL-NEXT: testb $1, %al
+; KNL-NEXT: testb $1, %cl
; KNL-NEXT: fld %st(1)
; KNL-NEXT: fcmovne %st(3), %st
-; KNL-NEXT: kshiftrw $1, %k0, %k1
-; KNL-NEXT: kmovw %k1, %eax
+; KNL-NEXT: kmovw %k2, %eax
; KNL-NEXT: testb $1, %al
; KNL-NEXT: fld %st(2)
; KNL-NEXT: fcmovne %st(4), %st
@@ -35,10 +35,10 @@ target triple = "x86_64-unknown-linux-gnu"
; KNL-NEXT: fxch %st(3)
; KNL-NEXT: fstpt (%rdi)
; KNL-NEXT: fxch %st(1)
-; KNL-NEXT: fstpt 10(%rdi)
-; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt 30(%rdi)
+; KNL-NEXT: fxch %st(1)
; KNL-NEXT: fstpt 20(%rdi)
+; KNL-NEXT: fstpt 10(%rdi)
; KNL-NEXT: vzeroupper
; KNL-NEXT: retq
;
@@ -46,20 +46,20 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX: # %bb.0: # %bb
; SKX-NEXT: vpslld $31, %xmm0, %xmm0
; SKX-NEXT: vpmovd2m %xmm0, %k0
+; SKX-NEXT: kshiftrb $1, %k0, %k1
+; SKX-NEXT: kmovd %k1, %eax
; SKX-NEXT: kshiftrb $2, %k0, %k1
; SKX-NEXT: kshiftrb $1, %k1, %k2
-; SKX-NEXT: kmovd %k1, %eax
+; SKX-NEXT: kmovd %k1, %ecx
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld1
; SKX-NEXT: fldz
; SKX-NEXT: fld %st(0)
; SKX-NEXT: fcmovne %st(2), %st
-; SKX-NEXT: kmovd %k2, %eax
-; SKX-NEXT: testb $1, %al
+; SKX-NEXT: testb $1, %cl
; SKX-NEXT: fld %st(1)
; SKX-NEXT: fcmovne %st(3), %st
-; SKX-NEXT: kshiftrb $1, %k0, %k1
-; SKX-NEXT: kmovd %k1, %eax
+; SKX-NEXT: kmovd %k2, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: fld %st(2)
; SKX-NEXT: fcmovne %st(4), %st
@@ -71,10 +71,10 @@ target triple = "x86_64-unknown-linux-gnu"
; SKX-NEXT: fxch %st(3)
; SKX-NEXT: fstpt (%rdi)
; SKX-NEXT: fxch %st(1)
-; SKX-NEXT: fstpt 10(%rdi)
-; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt 30(%rdi)
+; SKX-NEXT: fxch %st(1)
; SKX-NEXT: fstpt 20(%rdi)
+; SKX-NEXT: fstpt 10(%rdi)
; SKX-NEXT: retq
bb:
%tmp = select <4 x i1> %m, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/pr34177.ll b/llvm/test/CodeGen/X86/pr34177.ll
index 2653fcdcb75cad..49dd684025498d 100644
--- a/llvm/test/CodeGen/X86/pr34177.ll
+++ b/llvm/test/CodeGen/X86/pr34177.ll
@@ -49,20 +49,20 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-LABEL: test:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %k0
-; AVX512VL-NEXT: kshiftrb $1, %k0, %k1
-; AVX512VL-NEXT: kshiftrb $2, %k0, %k2
+; AVX512VL-NEXT: kshiftrb $2, %k0, %k1
+; AVX512VL-NEXT: kshiftrb $1, %k0, %k2
; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fld1
; AVX512VL-NEXT: fldz
; AVX512VL-NEXT: fld %st(0)
; AVX512VL-NEXT: fcmovne %st(2), %st
-; AVX512VL-NEXT: kmovd %k1, %eax
+; AVX512VL-NEXT: kshiftrb $1, %k1, %k0
+; AVX512VL-NEXT: kmovd %k0, %eax
; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fld %st(1)
; AVX512VL-NEXT: fcmovne %st(3), %st
-; AVX512VL-NEXT: kshiftrb $1, %k2, %k0
-; AVX512VL-NEXT: kmovd %k0, %eax
+; AVX512VL-NEXT: kmovd %k1, %eax
; AVX512VL-NEXT: testb $1, %al
; AVX512VL-NEXT: fld %st(2)
; AVX512VL-NEXT: fcmovne %st(4), %st
@@ -83,11 +83,11 @@ define void @test(<4 x i64> %a, <4 x x86_fp80> %b, ptr %c) local_unnamed_addr {
; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt (%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
-; AVX512VL-NEXT: fstpt 20(%rdi)
-; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 60(%rdi)
; AVX512VL-NEXT: fadd %st, %st(0)
; AVX512VL-NEXT: fstpt 40(%rdi)
+; AVX512VL-NEXT: fadd %st, %st(0)
+; AVX512VL-NEXT: fstpt 20(%rdi)
%1 = icmp eq <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %a
%2 = select <4 x i1> %1, <4 x x86_fp80> <x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000>, <4 x x86_fp80> zeroinitializer
%3 = fadd <4 x x86_fp80> %2, %2
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index 0ec81c8077cd49..d5351ab0a96959 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -64,7 +64,7 @@ define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512DQ-LABEL: mask_replication_factor2_vf4:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: kmovb (%rdi), %k0
+; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vpmovsxdq %xmm0, %ymm0
; AVX512DQ-NEXT: vpmovd2m %ymm0, %k1
@@ -75,7 +75,7 @@ define void @mask_replication_factor2_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor2_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -496,7 +496,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512DQ-LABEL: mask_replication_factor3_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: kmovb (%rdi), %k0
+; AVX512DQ-NEXT: kmovw (%rdi), %k0
; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u>
; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -513,7 +513,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor3_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u>
@@ -572,7 +572,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor3_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u>
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -840,31 +840,29 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovd (%rdi), %k0
; AVX512BW-NEXT: kshiftrd $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k4
-; AVX512BW-NEXT: kmovw (%rdi), %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k3
-; AVX512BW-NEXT: kmovq %k4, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovd %eax, %k3
+; AVX512BW-NEXT: kandw %k3, %k0, %k2
+; AVX512BW-NEXT: kmovq %k3, %k7
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k3
+; AVX512BW-NEXT: kshiftrw $14, %k3, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k3, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $13, %k3, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovq %k3, %k4
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-17, %ax
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovd %eax, %k5
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
@@ -917,22 +915,22 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $4, %k0, %k7
-; AVX512BW-NEXT: kshiftlw $15, %k7, %k2
+; AVX512BW-NEXT: kshiftrd $4, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k2
; AVX512BW-NEXT: kshiftrw $3, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
-; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k6
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k7, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrd $5, %k0, %k2
@@ -942,141 +940,142 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $27, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k7
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k4
; AVX512BW-NEXT: kshiftrd $26, %k0, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovq %k6, %k2
-; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovq %k7, %k2
+; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $14, %k4, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $28, %k0, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
+; AVX512BW-NEXT: kshiftrw $13, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $28, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $9, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $29, %k0, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $8, %k6, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $29, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $30, %k0, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $30, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $5, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $31, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k7
+; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $31, %k0, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k7
-; AVX512BW-NEXT: kshiftrw $2, %k7, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $21, %k0, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k5
+; AVX512BW-NEXT: kandw %k2, %k1, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k5, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $22, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $13, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $22, %k0, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $11, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $23, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
+; AVX512BW-NEXT: kshiftrd $23, %k0, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $9, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $24, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $25, %k0, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $25, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
@@ -1086,63 +1085,63 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $18, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $6, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $20, %k0, %k2
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k5
-; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
+; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -1152,128 +1151,128 @@ define void @mask_replication_factor3_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kshiftrd $11, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
-; AVX512BW-NEXT: kshiftrd $10, %k0, %k5
-; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kshiftrd $10, %k0, %k4
+; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k4, %k2
; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $12, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
+; AVX512BW-NEXT: kshiftrd $12, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $13, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $9, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $13, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $8, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $6, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $14, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT: kshiftrd $14, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $5, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $3, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $15, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $15, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm4 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $14, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $6, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $13, %k5, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $11, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $7, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $11, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $7, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k5, %k6
+; AVX512BW-NEXT: kshiftrw $9, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $8, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $8, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
+; AVX512BW-NEXT: kshiftrd $8, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $6, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $5, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $5, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrd $9, %k0, %k0
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $4, %k0, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
@@ -1447,19 +1446,18 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: kshiftrq $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k4
-; AVX512BW-NEXT: kmovw (%rdi), %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k3
-; AVX512BW-NEXT: kmovq %k4, %k7
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovd %eax, %k3
+; AVX512BW-NEXT: kandw %k3, %k0, %k2
+; AVX512BW-NEXT: kmovq %k3, %k7
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k3
+; AVX512BW-NEXT: kshiftrw $14, %k3, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k3, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $13, %k3, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
@@ -1475,9 +1473,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovq %k3, %k5
; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-65, %ax
@@ -1490,8 +1487,9 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovq %k3, %k5
+; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $8, %k2, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
@@ -1549,47 +1547,47 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $59, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrq $58, %k0, %k2
-; AVX512BW-NEXT: kmovq %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT: kshiftrq $58, %k0, %k1
+; AVX512BW-NEXT: kmovq %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $13, %k1, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $13, %k2, %k7
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k2, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrq $60, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $61, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
@@ -1598,12 +1596,12 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k2, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k6, %k1, %k1
@@ -1611,8 +1609,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -1620,143 +1618,142 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $53, %k0, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k2
+; AVX512BW-NEXT: kandw %k3, %k1, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $54, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $55, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $56, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $57, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 8-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $48, %k0, %k2
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $48, %k0, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT: kandw %k3, %k1, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $49, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $50, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $51, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $52, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: korw %k1, %k2, %k1
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $43, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
@@ -1766,48 +1763,49 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $44, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrq $45, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrq $46, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -1819,8 +1817,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -1828,7 +1825,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $37, %k0, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
@@ -1839,48 +1837,48 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrq $39, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrq $40, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrq $41, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
@@ -1907,20 +1905,19 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrq $33, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $34, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
@@ -1930,7 +1927,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -1939,14 +1937,14 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrq $36, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
@@ -1956,8 +1954,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
@@ -1966,102 +1964,102 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $27, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrq $26, %k0, %k3
; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $12, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrq $28, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrq $29, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrq $29, %k0, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrq $30, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrq $31, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
-; AVX512BW-NEXT: kshiftrq $21, %k0, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k1} {z}
+; AVX512BW-NEXT: kshiftrq $21, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrq $22, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrq $23, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -2074,105 +2072,104 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrq $25, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 8-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
-; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $16, %k0, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k3, %k2
+; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k2} {z}
+; AVX512BW-NEXT: kshiftrq $16, %k0, %k2
+; AVX512BW-NEXT: kandw %k5, %k2, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $13, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $13, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k3, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $17, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrq $18, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrq $19, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $20, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kshiftrq $11, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
@@ -2182,28 +2179,29 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $12, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrq $13, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k3, %k6
@@ -2212,8 +2210,7 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
@@ -2222,12 +2219,12 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k7, %k2, %k2
@@ -2244,11 +2241,11 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k2} {z}
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrq $6, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
@@ -2272,7 +2269,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -2281,7 +2279,8 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -2289,13 +2288,11 @@ define void @mask_replication_factor3_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrq $9, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -2361,7 +2358,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512DQ-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: kmovb (%rdi), %k0
+; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0
; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
@@ -2373,7 +2370,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512DQ-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512DQ-FAST: # %bb.0:
-; AVX512DQ-FAST-NEXT: kmovb (%rdi), %k0
+; AVX512DQ-FAST-NEXT: kmovw (%rdi), %k0
; AVX512DQ-FAST-NEXT: vpmovm2d %k0, %ymm0
; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
@@ -2385,7 +2382,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-SLOW-NEXT: kmovq (%rdi), %k1
; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -2398,7 +2395,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: kmovw (%rdi), %k1
+; AVX512BW-FAST-NEXT: kmovq (%rdi), %k1
; AVX512BW-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
@@ -2411,7 +2408,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512VBMI-SLOW-LABEL: mask_replication_factor4_vf2:
; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: kmovw (%rdi), %k1
+; AVX512VBMI-SLOW-NEXT: kmovq (%rdi), %k1
; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VBMI-SLOW-NEXT: vpmovsxdq %xmm0, %xmm0
@@ -2424,7 +2421,7 @@ define void @mask_replication_factor4_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512VBMI-FAST-LABEL: mask_replication_factor4_vf2:
; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: kmovw (%rdi), %k1
+; AVX512VBMI-FAST-NEXT: kmovq (%rdi), %k1
; AVX512VBMI-FAST-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VBMI-FAST-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,1,1,1,1]
@@ -2470,7 +2467,7 @@ define void @mask_replication_factor4_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor4_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -3154,7 +3151,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor5_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -3218,7 +3215,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor5_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd (%rdi), %k0
+; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
@@ -3585,14 +3582,12 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
;
; AVX512BW-LABEL: mask_replication_factor5_vf32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd (%rdi), %k5
-; AVX512BW-NEXT: kshiftrd $1, %k5, %k1
+; AVX512BW-NEXT: kmovd (%rdi), %k0
+; AVX512BW-NEXT: kshiftrd $1, %k0, %k1
; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kmovw (%rdi), %k2
-; AVX512BW-NEXT: kandw %k3, %k2, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kmovd %eax, %k6
+; AVX512BW-NEXT: kandw %k6, %k0, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
@@ -3639,23 +3634,23 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovd %eax, %k7
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k2, %k1, %k3
-; AVX512BW-NEXT: kshiftrd $2, %k5, %k1
+; AVX512BW-NEXT: kshiftrd $2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
-; AVX512BW-NEXT: kmovd %eax, %k7
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
-; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k4
+; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
@@ -3671,29 +3666,29 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $2, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
-; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k3
+; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $3, %k5, %k2
+; AVX512BW-NEXT: kshiftrd $3, %k0, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $29, %k5, %k1
+; AVX512BW-NEXT: kshiftrd $29, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
-; AVX512BW-NEXT: kshiftrd $28, %k5, %k1
+; AVX512BW-NEXT: kshiftrd $28, %k0, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k3
+; AVX512BW-NEXT: kandw %k6, %k1, %k3
+; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -3710,7 +3705,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $30, %k5, %k3
+; AVX512BW-NEXT: kshiftrd $30, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
@@ -3722,16 +3717,16 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $31, %k5, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $31, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
; AVX512BW-NEXT: kshiftrw $4, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
@@ -3743,96 +3738,98 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k4, %k7
; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $25, %k5, %k3
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k2
+; AVX512BW-NEXT: kshiftrd $25, %k0, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kandw %k6, %k2, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k2, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
-; AVX512BW-NEXT: kshiftrd $26, %k5, %k7
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kshiftrd $26, %k0, %k7
+; AVX512BW-NEXT: kmovq %k0, %k4
+; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
; AVX512BW-NEXT: kshiftrw $13, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
-; AVX512BW-NEXT: kshiftrd $27, %k5, %k6
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrd $27, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftlw $14, %k0, %k1
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm2 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $22, %k5, %k0
-; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $22, %k0, %k3
+; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k7
+; AVX512BW-NEXT: kandw %k1, %k3, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $13, %k0, %k7
+; AVX512BW-NEXT: kshiftrw $13, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
-; AVX512BW-NEXT: kshiftrd $23, %k5, %k7
-; AVX512BW-NEXT: kmovq %k5, %k0
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
+; AVX512BW-NEXT: kshiftrd $23, %k0, %k7
; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
; AVX512BW-NEXT: kshiftrw $12, %k7, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
@@ -3852,8 +3849,7 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrd $24, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
@@ -3862,10 +3858,11 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
@@ -3878,77 +3875,76 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k2, %k3
-; AVX512BW-NEXT: korw %k3, %k5, %k3
-; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
-; AVX512BW-NEXT: korw %k7, %k3, %k2
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k7, %k2, %k2
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm3 {%k2} {z}
; AVX512BW-NEXT: kshiftrd $19, %k0, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k3
+; AVX512BW-NEXT: kandw %k7, %k2, %k4
; AVX512BW-NEXT: kshiftlw $15, %k2, %k6
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrd $20, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $9, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $8, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $7, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrd $21, %k0, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kandw %k3, %k4, %k4
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: korw %k1, %k4, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
@@ -3971,16 +3967,16 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k3, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $17, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
@@ -4024,53 +4020,53 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrd $12, %k0, %k3
; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k3, %k2
; AVX512BW-NEXT: kshiftrw $14, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k1, %k4
-; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $14, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $14, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k4
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k3, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $15, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $15, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k4
; AVX512BW-NEXT: kshiftrw $4, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
@@ -4079,17 +4075,17 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $9, %k0, %k3
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kshiftrd $9, %k0, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k1
+; AVX512BW-NEXT: kandw %k1, %k2, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
@@ -4105,7 +4101,8 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k7, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4126,16 +4123,15 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kandw %k3, %k4, %k4
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $15, %k7, %k5
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
@@ -4144,63 +4140,63 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
-; AVX512BW-NEXT: kshiftlw $14, %k7, %k2
-; AVX512BW-NEXT: korw %k2, %k4, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k2} {z}
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k4, %k4
+; AVX512BW-NEXT: kshiftlw $14, %k7, %k3
+; AVX512BW-NEXT: korw %k3, %k4, %k3
+; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm7 {%k3} {z}
; AVX512BW-NEXT: kshiftrd $6, %k0, %k4
; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k4, %k5
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $14, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $13, %k4, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrd $7, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k5, %k5
; AVX512BW-NEXT: kshiftrd $8, %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -4208,69 +4204,69 @@ define void @mask_replication_factor5_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $3, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k5, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm8 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $14, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $4, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $14, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $13, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k5, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $4, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrd $5, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
@@ -4518,9 +4514,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kmovw (%rdi), %k2
-; AVX512BW-NEXT: kandw %k1, %k2, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kandw %k1, %k5, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: movw $-5, %ax
@@ -4640,8 +4635,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4650,12 +4645,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -4699,11 +4694,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $8, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
@@ -4716,12 +4712,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4768,29 +4763,29 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $12, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -4832,15 +4827,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kmovq %k4, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4849,16 +4844,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
+; AVX512BW-NEXT: korw %k7, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -4896,32 +4890,32 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $18, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -4956,12 +4950,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -4970,22 +4963,23 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $22, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -4997,8 +4991,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5007,20 +5001,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5037,12 +5031,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5051,8 +5044,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5063,59 +5056,60 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $28, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5128,20 +5122,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5150,19 +5143,19 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5171,16 +5164,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5193,10 +5185,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
@@ -5213,16 +5207,16 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5239,12 +5233,11 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5274,17 +5267,18 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $37, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
@@ -5294,20 +5288,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $38, %k5, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -5342,8 +5336,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5352,8 +5345,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
@@ -5363,7 +5355,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5372,37 +5365,38 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $42, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5411,20 +5405,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5437,32 +5431,31 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k6} {z}
-; AVX512BW-NEXT: kandw %k3, %k1, %k0
+; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrq $45, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5487,21 +5480,22 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $47, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5514,36 +5508,35 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $49, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5552,12 +5545,12 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -5596,17 +5589,18 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $53, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
@@ -5615,16 +5609,15 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5651,8 +5644,8 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -5663,7 +5656,7 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5672,29 +5665,28 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrq $57, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5715,49 +5707,49 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $59, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $60, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -5770,20 +5762,20 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -5792,35 +5784,35 @@ define void @mask_replication_factor5_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftlw $15, %k5, %k1
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k4
; AVX512BW-NEXT: korw %k4, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
; AVX512BW-NEXT: korw %k2, %k0, %k0
@@ -5892,7 +5884,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor6_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u>
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -5997,7 +5989,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor6_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd (%rdi), %k0
+; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
@@ -6377,9 +6369,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kmovw (%rdi), %k1
-; AVX512BW-NEXT: kandw %k0, %k1, %k2
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kandw %k0, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-5, %ax
@@ -6389,15 +6380,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-9, %ax
-; AVX512BW-NEXT: kmovd %eax, %k7
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k0
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: movw $-17, %ax
+; AVX512BW-NEXT: kmovd %eax, %k7
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-33, %ax
@@ -6427,15 +6418,15 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
-; AVX512BW-NEXT: kmovd %eax, %k0
-; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k6
; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k1, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
@@ -6477,107 +6468,107 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $14, %k2, %k3
+; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $13, %k2, %k3
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k3
+; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $30, %k5, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $11, %k4, %k7
+; AVX512BW-NEXT: kshiftrd $30, %k5, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $11, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $10, %k4, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $10, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $9, %k4, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k4, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k4, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k3, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kandw %k6, %k1, %k4
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
+; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k3
; AVX512BW-NEXT: kshiftrd $31, %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k7, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftlw $14, %k7, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: korw %k6, %k3, %k3
+; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT: korw %k1, %k3, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm1 {%k1} {z}
-; AVX512BW-NEXT: kmovq %k5, %k1
-; AVX512BW-NEXT: kshiftrd $26, %k5, %k5
-; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k4
-; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k4, %k7
+; AVX512BW-NEXT: kshiftrd $26, %k5, %k3
+; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k3, %k1
+; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k0, %k6, %k6
-; AVX512BW-NEXT: kshiftrd $27, %k1, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
+; AVX512BW-NEXT: kshiftrd $27, %k5, %k7
+; AVX512BW-NEXT: kmovq %k5, %k3
+; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $15, %k7, %k7
; AVX512BW-NEXT: kshiftrw $13, %k7, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrd $28, %k1, %k6
-; AVX512BW-NEXT: kmovq %k1, %k4
+; AVX512BW-NEXT: kshiftrd $28, %k3, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
@@ -6589,147 +6580,149 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k5, %k3
-; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: korw %k0, %k3, %k2
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k5, %k4
+; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: korw %k2, %k4, %k2
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm2 {%k2} {z}
-; AVX512BW-NEXT: kmovq %k4, %k0
-; AVX512BW-NEXT: kshiftrd $24, %k4, %k2
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $24, %k0, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k3
+; AVX512BW-NEXT: kandw %k4, %k2, %k4
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
; AVX512BW-NEXT: kshiftrw $14, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftrw $12, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k4, %k4
; AVX512BW-NEXT: kshiftrw $10, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $25, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
+; AVX512BW-NEXT: korw %k2, %k4, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $25, %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $6, %k3, %k5
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $6, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $5, %k3, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $5, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $4, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $3, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $2, %k5, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k1, %k3
+; AVX512BW-NEXT: kshiftrw $2, %k0, %k4
+; AVX512BW-NEXT: kmovq %k0, %k1
+; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k5, %k2, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm3 {%k1} {z}
-; AVX512BW-NEXT: kmovq %k0, %k1
-; AVX512BW-NEXT: kshiftrd $21, %k0, %k3
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kandw %k4, %k3, %k2
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $21, %k2, %k1
+; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $13, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k1, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k5
+; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k5, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $22, %k1, %k4
-; AVX512BW-NEXT: kmovq %k1, %k7
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $13, %k5, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $12, %k5, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kshiftrd $22, %k2, %k4
+; AVX512BW-NEXT: kmovq %k2, %k6
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k4
-; AVX512BW-NEXT: kshiftrd $23, %k7, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k2
-; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k4
+; AVX512BW-NEXT: kshiftrd $23, %k6, %k5
+; AVX512BW-NEXT: kmovq %k6, %k7
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k3
+; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k4, %k4
@@ -6737,232 +6730,233 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
-; AVX512BW-NEXT: korw %k2, %k4, %k2
-; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT: korw %k3, %k4, %k3
+; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm4 {%k3} {z}
; AVX512BW-NEXT: kmovq %k7, %k4
; AVX512BW-NEXT: kshiftrd $18, %k7, %k6
; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k2
-; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k6, %k5
+; AVX512BW-NEXT: kshiftlw $15, %k6, %k3
+; AVX512BW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrd $19, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k5, %k5
; AVX512BW-NEXT: kshiftrw $9, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrd $20, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k1, %k3
-; AVX512BW-NEXT: korw %k3, %k5, %k3
-; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: korw %k1, %k3, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $16, %k4, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k3
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $16, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k3, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $17, %k4, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
+; AVX512BW-NEXT: kshiftrd $17, %k0, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k3, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k3, %k5
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $3, %k2, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k2, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $2, %k3, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT: kmovq %k4, %k0
-; AVX512BW-NEXT: kshiftrd $13, %k4, %k3
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $13, %k0, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k2
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
-; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k4, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $13, %k4, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k4, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftrd $14, %k0, %k3
+; AVX512BW-NEXT: kandw %k1, %k2, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $13, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrd $14, %k0, %k4
; AVX512BW-NEXT: kmovq %k0, %k7
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $11, %k3, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $10, %k3, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k3, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $8, %k3, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $7, %k3, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $6, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k3
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k3, %k4
; AVX512BW-NEXT: kshiftrd $15, %k7, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k2
-; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
-; AVX512BW-NEXT: korw %k6, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k3, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k3
+; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k4, %k4
; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
-; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
-; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $10, %k7, %k2
-; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kandw %k1, %k2, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k1
-; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
+; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
+; AVX512BW-NEXT: korw %k3, %k4, %k3
+; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm7 {%k3} {z}
+; AVX512BW-NEXT: kmovq %k7, %k3
+; AVX512BW-NEXT: kshiftrd $10, %k7, %k0
+; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kandw %k1, %k0, %k5
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrd $11, %k7, %k6
-; AVX512BW-NEXT: kmovq %k7, %k2
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -6973,137 +6967,135 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $8, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrd $12, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrd $12, %k3, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k5, %k4
-; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: korw %k0, %k4, %k1
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k1, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm8 {%k1} {z}
-; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kshiftrd $8, %k2, %k1
+; AVX512BW-NEXT: kshiftrd $8, %k3, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k4
+; AVX512BW-NEXT: kandw %k6, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $9, %k2, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
+; AVX512BW-NEXT: kshiftrd $9, %k3, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k4, %k5
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k4, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k4, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $3, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $3, %k3, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k2, %k4
-; AVX512BW-NEXT: kmovq %k2, %k5
-; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $2, %k3, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: korw %k3, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $5, %k1, %k4
-; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kandw %k6, %k4, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k4, %k7
+; AVX512BW-NEXT: kshiftrd $5, %k1, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kandw %k6, %k2, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
; AVX512BW-NEXT: kshiftrw $14, %k7, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k7, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k7, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrd $6, %k1, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kandw %k0, %k3, %k3
@@ -7131,8 +7123,8 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovq %k2, %k6
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k4, %k4
; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k4, %k4
; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
@@ -7140,34 +7132,33 @@ define void @mask_replication_factor6_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k3, %k4, %k3
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm10 {%k3} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kshiftrw $14, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrd $3, %k1, %k4
; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
; AVX512BW-NEXT: kshiftrw $13, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k4, %k5
; AVX512BW-NEXT: korw %k5, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k4, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
@@ -7473,11 +7464,10 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: movw $-3, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kmovw (%rdi), %k0
-; AVX512BW-NEXT: kandw %k1, %k0, %k3
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k0
-; AVX512BW-NEXT: korw %k0, %k3, %k0
+; AVX512BW-NEXT: kandw %k1, %k5, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k1
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
@@ -7578,20 +7568,20 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -7606,8 +7596,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
@@ -7804,8 +7794,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
@@ -7830,8 +7820,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -7859,18 +7849,18 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $15, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovq %k4, %k3
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
@@ -7896,49 +7886,51 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $17, %k5, %k1
+; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $18, %k5, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $18, %k7, %k1
+; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
@@ -7955,16 +7947,17 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $19, %k5, %k1
+; AVX512BW-NEXT: kmovq %k3, %k7
+; AVX512BW-NEXT: kshiftrq $19, %k3, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -7981,31 +7974,31 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $20, %k5, %k1
+; AVX512BW-NEXT: kshiftrq $20, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $21, %k5, %k1
+; AVX512BW-NEXT: kmovq %k7, %k4
+; AVX512BW-NEXT: kshiftrq $21, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -8021,61 +8014,63 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $22, %k5, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $22, %k4, %k1
+; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $23, %k5, %k6
+; AVX512BW-NEXT: kshiftrq $23, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm8 {%k1} {z}
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $24, %k5, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
@@ -8093,12 +8088,12 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $25, %k5, %k1
+; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8106,15 +8101,16 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
@@ -8123,7 +8119,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $26, %k5, %k1
+; AVX512BW-NEXT: kshiftrq $26, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
@@ -8144,8 +8140,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kmovq %k5, %k7
-; AVX512BW-NEXT: kshiftrq $27, %k5, %k1
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $27, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8157,46 +8153,44 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrq $28, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrq $29, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8213,8 +8207,7 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
@@ -8223,42 +8216,43 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $31, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -8292,102 +8286,104 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $33, %k5, %k1
+; AVX512BW-NEXT: kmovq %k5, %k7
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $34, %k5, %k1
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $34, %k7, %k1
+; AVX512BW-NEXT: kmovq %k7, %k4
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $35, %k5, %k1
+; AVX512BW-NEXT: kmovq %k4, %k7
+; AVX512BW-NEXT: kshiftrq $35, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $36, %k5, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $36, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $37, %k5, %k1
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovq %k7, %k3
+; AVX512BW-NEXT: kshiftrq $37, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -8395,29 +8391,30 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k7} {z}
-; AVX512BW-NEXT: kandw %k4, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $38, %k5, %k1
+; AVX512BW-NEXT: kshiftrq $38, %k3, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -8428,105 +8425,107 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $39, %k5, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $39, %k3, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 896(%rsi), %zmm14 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $40, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kmovq %k3, %k7
+; AVX512BW-NEXT: kshiftrq $40, %k3, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $41, %k5, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $41, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $42, %k5, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $42, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 960(%rsi), %zmm15 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k3, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $43, %k5, %k1
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $43, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8534,13 +8533,14 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8550,33 +8550,31 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $44, %k5, %k1
+; AVX512BW-NEXT: kshiftrq $44, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $45, %k5, %k1
+; AVX512BW-NEXT: kshiftrq $45, %k7, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -8584,70 +8582,72 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $46, %k5, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $46, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $47, %k5, %k6
+; AVX512BW-NEXT: kshiftrq $47, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k1} {z}
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -8655,45 +8655,46 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $49, %k5, %k1
+; AVX512BW-NEXT: kmovq %k5, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $50, %k5, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $50, %k2, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
@@ -8701,8 +8702,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -8715,16 +8716,17 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $51, %k5, %k1
+; AVX512BW-NEXT: kmovq %k2, %k7
+; AVX512BW-NEXT: kshiftrq $51, %k2, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -8735,36 +8737,37 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $52, %k5, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $52, %k7, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $53, %k5, %k1
+; AVX512BW-NEXT: kshiftrq $53, %k7, %k1
+; AVX512BW-NEXT: kmovq %k7, %k4
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -8780,83 +8783,85 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $54, %k5, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovq %k4, %k7
+; AVX512BW-NEXT: kshiftrq $54, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $55, %k5, %k6
+; AVX512BW-NEXT: kandw %k5, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $55, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $56, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrq $57, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
@@ -8865,20 +8870,18 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -8899,7 +8902,8 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
-; AVX512BW-NEXT: kandw %k2, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -8908,17 +8912,18 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
@@ -8932,24 +8937,22 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -8988,40 +8991,40 @@ define void @mask_replication_factor6_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $63, %k5, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $63, %k5, %k0
; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k0
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k4
+; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k0, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: korw %k0, %k2, %k0
+; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 %zmm23, 1472(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, 1408(%rdx)
@@ -9095,7 +9098,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor7_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u>
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -9105,9 +9108,9 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 {%k1}
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rdx)
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vmovq %xmm1, 48(%rdx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, 48(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64
@@ -9164,7 +9167,7 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor7_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd (%rdi), %k0
+; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u>
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
@@ -9681,19 +9684,18 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
;
; AVX512BW-LABEL: mask_replication_factor7_vf32:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kmovd (%rdi), %k6
; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: kmovw (%rdi), %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
-; AVX512BW-NEXT: kmovq %k2, %k6
-; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: kandw %k0, %k6, %k1
+; AVX512BW-NEXT: kmovq %k0, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovq %k2, %k4
+; AVX512BW-NEXT: kmovq %k2, %k3
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $13, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
@@ -9725,22 +9727,20 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovd (%rdi), %k3
-; AVX512BW-NEXT: kshiftrd $1, %k3, %k0
+; AVX512BW-NEXT: kshiftrd $1, %k6, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
-; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kmovd %eax, %k7
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
@@ -9771,7 +9771,7 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $2, %k3, %k2
+; AVX512BW-NEXT: kshiftrd $2, %k6, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kshiftlw $14, %k2, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
@@ -9781,357 +9781,362 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
-; AVX512BW-NEXT: kshiftrd $29, %k3, %k1
+; AVX512BW-NEXT: kmovq %k6, %k2
+; AVX512BW-NEXT: kshiftrd $29, %k6, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kandw %k6, %k1, %k0
+; AVX512BW-NEXT: kmovq %k4, %k6
+; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k4, %k1, %k0
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k2
-; AVX512BW-NEXT: korw %k2, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k2
-; AVX512BW-NEXT: kshiftrd $30, %k3, %k0
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $13, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $11, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $10, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $8, %k0, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k2, %k0
-; AVX512BW-NEXT: kandw %k5, %k0, %k2
-; AVX512BW-NEXT: kshiftrd $31, %k3, %k4
-; AVX512BW-NEXT: kmovd %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kshiftlw $15, %k4, %k0
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrd $30, %k2, %k1
+; AVX512BW-NEXT: kmovq %k2, %k4
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $10, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $9, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k7, %k0, %k3
+; AVX512BW-NEXT: kshiftrd $31, %k4, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k0, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k2} {z}
-; AVX512BW-NEXT: kshiftrd $27, %k3, %k2
-; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k0
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k4
-; AVX512BW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k4, %k7
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512BW-NEXT: korw %k0, %k3, %k0
+; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
+; AVX512BW-NEXT: korw %k1, %k0, %k1
+; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $27, %k2, %k1
+; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kandw %k6, %k1, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
+; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $13, %k4, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $13, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kandw %k6, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $12, %k4, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k7
-; AVX512BW-NEXT: kshiftrd $28, %k3, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k7
+; AVX512BW-NEXT: kshiftrd $28, %k2, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k7, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $9, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k5, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k6, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $4, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $3, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
-; AVX512BW-NEXT: kmovq %k3, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k5
; AVX512BW-NEXT: korw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: korw %k1, %k0, %k1
-; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT: korw %k7, %k0, %k2
+; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm2 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $25, %k6, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k0
-; AVX512BW-NEXT: korw %k0, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $10, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k5
-; AVX512BW-NEXT: kshiftrd $26, %k6, %k0
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $2, %k6, %k5
-; AVX512BW-NEXT: korw %k5, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k2, %k4
-; AVX512BW-NEXT: korw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: korw %k6, %k0, %k2
-; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k2} {z}
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $23, %k2, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k2
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrd $22, %k2, %k4
-; AVX512BW-NEXT: kmovd %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovq %k2, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k4, %k2
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k5
+; AVX512BW-NEXT: kshiftrd $26, %k6, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $3, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $2, %k6, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k6, %k2, %k1
+; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $23, %k3, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $22, %k3, %k5
+; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kmovq %k3, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k3
+; AVX512BW-NEXT: kshiftrw $14, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $13, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k2, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k2
-; AVX512BW-NEXT: kshiftrd $24, %k6, %k0
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k5
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k3, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kshiftrd $24, %k6, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k3, %k5
; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k2, %k0
-; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: korw %k1, %k0, %k1
-; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k1} {z}
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $20, %k6, %k0
-; AVX512BW-NEXT: kmovd %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kandw %k3, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
+; AVX512BW-NEXT: korw %k0, %k2, %k2
+; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm4 {%k2} {z}
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $20, %k3, %k5
+; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k5
-; AVX512BW-NEXT: kshiftrd $21, %k6, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k1, %k6
+; AVX512BW-NEXT: kandw %k0, %k5, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k5, %k6
+; AVX512BW-NEXT: kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k6, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $13, %k6, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k2, %k5
+; AVX512BW-NEXT: kshiftrd $21, %k3, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $11, %k1, %k6
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $10, %k1, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k7, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $6, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k5, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k5
+; AVX512BW-NEXT: kandw %k0, %k2, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $15, %k7, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k7, %k2
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k4, %k5, %k5
-; AVX512BW-NEXT: kshiftlw $14, %k7, %k4
-; AVX512BW-NEXT: korw %k4, %k5, %k4
-; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k7, %k1
+; AVX512BW-NEXT: korw %k1, %k5, %k1
+; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $18, %k7, %k1
-; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k6
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $18, %k4, %k2
+; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k5
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k7
+; AVX512BW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $13, %k1, %k6
-; AVX512BW-NEXT: kmovq %k1, %k2
-; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $13, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
+; AVX512BW-NEXT: kshiftrw $12, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
+; AVX512BW-NEXT: kshiftrw $11, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kandw %k3, %k5, %k6
-; AVX512BW-NEXT: kshiftrd $19, %k7, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k6
+; AVX512BW-NEXT: kshiftrd $19, %k4, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $10, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
@@ -10139,99 +10144,98 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $9, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $6, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k0, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k6, %k6
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
-; AVX512BW-NEXT: kmovq %k2, %k7
+; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k5, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k7, %k2, %k2
-; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k2} {z}
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k3
+; AVX512BW-NEXT: korw %k3, %k5, %k3
+; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT: korw %k7, %k3, %k3
+; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm6 {%k3} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $16, %k1, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k3
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k2, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k2
+; AVX512BW-NEXT: korw %k0, %k3, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k3
; AVX512BW-NEXT: kshiftrd $17, %k1, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $3, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k1, %k3, %k3
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k2, %k0
+; AVX512BW-NEXT: korw %k0, %k3, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
@@ -10243,264 +10247,264 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k1, %k0, %k1
; AVX512BW-NEXT: vmovdqa32 448(%rsi), %zmm7 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $13, %k0, %k2
-; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $14, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrd $13, %k0, %k1
+; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kandw %k6, %k1, %k2
-; AVX512BW-NEXT: kshiftrd $14, %k0, %k1
-; AVX512BW-NEXT: kmovq %k0, %k6
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k2, %k3
+; AVX512BW-NEXT: kshiftrd $14, %k0, %k2
+; AVX512BW-NEXT: kmovq %k0, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $13, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $10, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $9, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $8, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
-; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $7, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k2, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k2
-; AVX512BW-NEXT: kshiftrd $15, %k6, %k5
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k5
+; AVX512BW-NEXT: korw %k5, %k3, %k3
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k5
+; AVX512BW-NEXT: kshiftrd $15, %k1, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k3
+; AVX512BW-NEXT: kshiftrw $6, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $3, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $5, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $2, %k1, %k6
-; AVX512BW-NEXT: korw %k6, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k2, %k2
-; AVX512BW-NEXT: kshiftlw $14, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $4, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $3, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k0, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $2, %k3, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k1, %k2, %k1
-; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k1} {z}
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $11, %k2, %k6
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm8 {%k2} {z}
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $11, %k3, %k6
; AVX512BW-NEXT: kmovd %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k5
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k7, %k5, %k6
-; AVX512BW-NEXT: kshiftrd $12, %k2, %k5
+; AVX512BW-NEXT: kshiftrd $12, %k3, %k5
; AVX512BW-NEXT: kshiftlw $15, %k5, %k5
; AVX512BW-NEXT: kshiftrw $11, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $10, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $9, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $8, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $7, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $6, %k5, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k5, %k5
; AVX512BW-NEXT: korw %k5, %k6, %k5
-; AVX512BW-NEXT: kandw %k4, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $4, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $3, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $3, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
; AVX512BW-NEXT: kandw %k0, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $2, %k4, %k6
-; AVX512BW-NEXT: kmovq %k4, %k0
+; AVX512BW-NEXT: kshiftrw $2, %k7, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $14, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k5, %k4
-; AVX512BW-NEXT: kshiftlw $1, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $1, %k4, %k4
-; AVX512BW-NEXT: korw %k0, %k4, %k4
-; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k4} {z}
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 4-byte Reload
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k1
+; AVX512BW-NEXT: korw %k1, %k5, %k1
+; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 4-byte Reload
; AVX512BW-NEXT: kshiftrd $9, %k6, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k0, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k4, %k4
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k4, %k4
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kandw %k1, %k4, %k5
-; AVX512BW-NEXT: kshiftrd $10, %k6, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $9, %k4, %k6
+; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k5
+; AVX512BW-NEXT: kshiftrd $10, %k6, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $9, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $8, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $6, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $6, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kandw %k3, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $5, %k4, %k6
+; AVX512BW-NEXT: kandw %k4, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $5, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $3, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k5, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $2, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k5, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kshiftrw $2, %k4, %k5
+; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
-; AVX512BW-NEXT: korw %k2, %k4, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k1, %k2, %k1
+; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT: korw %k4, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm10 {%k1} {z}
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $7, %k3, %k1
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $7, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
-; AVX512BW-NEXT: kshiftrd $6, %k3, %k2
-; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: kandw %k7, %k2, %k4
-; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: kshiftrd $6, %k4, %k5
+; AVX512BW-NEXT: kmovd %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; AVX512BW-NEXT: kmovq %k4, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: kandw %k2, %k5, %k2
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k5
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $12, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $11, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $10, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $9, %k1, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
; AVX512BW-NEXT: kshiftrw $8, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k4
-; AVX512BW-NEXT: kshiftrd $8, %k3, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k2
+; AVX512BW-NEXT: kshiftrd $8, %k6, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k5
; AVX512BW-NEXT: kshiftrw $7, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k4, %k4
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $6, %k5, %k6
+; AVX512BW-NEXT: korw %k6, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k2, %k2
; AVX512BW-NEXT: kshiftrw $5, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k4, %k4
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $4, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k5, %k6
-; AVX512BW-NEXT: korw %k6, %k4, %k4
+; AVX512BW-NEXT: korw %k6, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k4, %k4
+; AVX512BW-NEXT: kandw %k6, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k5, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k4, %k4
+; AVX512BW-NEXT: kandw %k5, %k2, %k2
; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
-; AVX512BW-NEXT: korw %k1, %k4, %k1
+; AVX512BW-NEXT: korw %k1, %k2, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
@@ -10509,120 +10513,120 @@ define void @mask_replication_factor7_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftrd $4, %k6, %k1
; AVX512BW-NEXT: kmovd %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k4
+; AVX512BW-NEXT: kandw %k0, %k1, %k2
; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k4
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $13, %k0, %k5
-; AVX512BW-NEXT: korw %k5, %k4, %k4
+; AVX512BW-NEXT: korw %k5, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k4, %k5
-; AVX512BW-NEXT: kshiftrd $5, %k6, %k4
-; AVX512BW-NEXT: kshiftlw $15, %k4, %k4
-; AVX512BW-NEXT: kshiftrw $12, %k4, %k6
+; AVX512BW-NEXT: kandw %k1, %k2, %k5
+; AVX512BW-NEXT: kshiftrd $5, %k6, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $12, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $11, %k4, %k6
+; AVX512BW-NEXT: kshiftrw $11, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $10, %k4, %k6
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $9, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $9, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $8, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $8, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $7, %k4, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $7, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $6, %k4, %k4
-; AVX512BW-NEXT: korw %k4, %k5, %k4
-; AVX512BW-NEXT: kandw %k3, %k4, %k5
+; AVX512BW-NEXT: kandw %k3, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $6, %k2, %k2
+; AVX512BW-NEXT: korw %k2, %k5, %k2
+; AVX512BW-NEXT: kandw %k4, %k2, %k5
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
-; AVX512BW-NEXT: kshiftlw $15, %k3, %k4
-; AVX512BW-NEXT: kshiftrw $5, %k4, %k6
+; AVX512BW-NEXT: kshiftlw $15, %k3, %k2
+; AVX512BW-NEXT: kshiftrw $5, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kandw %k7, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $4, %k4, %k6
+; AVX512BW-NEXT: kshiftrw $4, %k2, %k6
+; AVX512BW-NEXT: korw %k6, %k5, %k5
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k5, %k5
+; AVX512BW-NEXT: kshiftrw $3, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
; AVX512BW-NEXT: kandw %k7, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $3, %k4, %k6
-; AVX512BW-NEXT: korw %k6, %k5, %k5
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k5, %k5
-; AVX512BW-NEXT: kshiftrw $2, %k4, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k2, %k6
; AVX512BW-NEXT: korw %k6, %k5, %k5
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k5, %k5
-; AVX512BW-NEXT: kshiftlw $14, %k3, %k2
-; AVX512BW-NEXT: korw %k2, %k5, %k2
-; AVX512BW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512BW-NEXT: korw %k4, %k2, %k2
+; AVX512BW-NEXT: kshiftlw $14, %k3, %k3
+; AVX512BW-NEXT: korw %k3, %k5, %k3
+; AVX512BW-NEXT: kshiftlw $1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $1, %k3, %k3
+; AVX512BW-NEXT: korw %k2, %k3, %k2
; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm12 {%k2} {z}
; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kshiftrw $14, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $13, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $12, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $11, %k3, %k4
-; AVX512BW-NEXT: korw %k4, %k2, %k2
-; AVX512BW-NEXT: kandw %k1, %k2, %k4
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: kshiftrd $3, %k1, %k2
+; AVX512BW-NEXT: kshiftrw $14, %k4, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $13, %k4, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $12, %k4, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k2
+; AVX512BW-NEXT: kshiftrw $11, %k4, %k3
+; AVX512BW-NEXT: korw %k3, %k2, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k2, %k3
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
+; AVX512BW-NEXT: kshiftrd $3, %k2, %k2
; AVX512BW-NEXT: kshiftlw $15, %k2, %k2
-; AVX512BW-NEXT: kshiftrw $10, %k2, %k3
-; AVX512BW-NEXT: korw %k3, %k4, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kshiftrw $10, %k2, %k4
+; AVX512BW-NEXT: korw %k4, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $9, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $8, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $7, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $6, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $5, %k2, %k4
; AVX512BW-NEXT: korw %k4, %k3, %k3
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k3, %k3
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k3, %k3
; AVX512BW-NEXT: kshiftrw $4, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k3, %k2
-; AVX512BW-NEXT: kandw %k7, %k2, %k2
+; AVX512BW-NEXT: kandw %k1, %k2, %k2
; AVX512BW-NEXT: kshiftrw $3, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k2, %k2
+; AVX512BW-NEXT: kandw %k7, %k2, %k2
; AVX512BW-NEXT: kshiftrw $2, %k0, %k3
; AVX512BW-NEXT: korw %k3, %k2, %k2
; AVX512BW-NEXT: kandw %k6, %k2, %k2
@@ -10934,95 +10938,94 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
;
; AVX512BW-LABEL: mask_replication_factor7_vf64:
; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: kmovq (%rdi), %k4
; AVX512BW-NEXT: movw $-3, %ax
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kmovw (%rdi), %k0
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovd %eax, %k0
+; AVX512BW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k0, %k4, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k4, %k1
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-5, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $13, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $13, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-9, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $12, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-17, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $11, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $11, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-33, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kmovq %k2, %k4
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $10, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $10, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-65, %ax
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $9, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: movw $-129, %ax
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq (%rdi), %k3
-; AVX512BW-NEXT: kshiftrq $1, %k3, %k0
-; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512BW-NEXT: kshiftrw $8, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kshiftrq $1, %k4, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-257, %ax # imm = 0xFEFF
; AVX512BW-NEXT: kmovd %eax, %k2
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovq %k2, %k5
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-513, %ax # imm = 0xFDFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $6, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-1025, %ax # imm = 0xFBFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $5, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-2049, %ax # imm = 0xF7FF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-4097, %ax # imm = 0xEFFF
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k2
-; AVX512BW-NEXT: korw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k3
+; AVX512BW-NEXT: korw %k3, %k0, %k0
; AVX512BW-NEXT: movw $-8193, %ax # imm = 0xDFFF
-; AVX512BW-NEXT: kmovd %eax, %k5
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: kmovd %eax, %k2
+; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: movw $-16385, %ax # imm = 0xBFFF
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512BW-NEXT: kandw %k1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $2, %k3, %k1
+; AVX512BW-NEXT: kshiftrq $2, %k4, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -11030,26 +11033,27 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k0, %k6
; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k6} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k7, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k1
-; AVX512BW-NEXT: kmovq %k3, %k7
-; AVX512BW-NEXT: kmovq %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512BW-NEXT: kshiftrq $3, %k3, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kmovq %k4, %k7
+; AVX512BW-NEXT: kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512BW-NEXT: kshiftrq $3, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -11057,24 +11061,23 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -11083,26 +11086,26 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z}
-; AVX512BW-NEXT: kandw %k2, %k6, %k1
+; AVX512BW-NEXT: kandw %k3, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $5, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
@@ -11116,108 +11119,108 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
-; AVX512BW-NEXT: kshiftrq $6, %k7, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
+; AVX512BW-NEXT: kshiftrq $6, %k7, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k6, %k6
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k6, %k6
-; AVX512BW-NEXT: kshiftlw $14, %k1, %k7
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
-; AVX512BW-NEXT: korw %k0, %k6, %k6
-; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k6} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
+; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $7, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $8, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $9, %k2, %k1
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $9, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm3 {%k7} {z}
@@ -11229,146 +11232,145 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $10, %k2, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kmovq %k5, %k4
+; AVX512BW-NEXT: kshiftrq $10, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $11, %k2, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $11, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 256(%rsi), %zmm4 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k6, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq %k2, %k7
-; AVX512BW-NEXT: kshiftrq $12, %k2, %k0
+; AVX512BW-NEXT: kshiftrq $12, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $13, %k7, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k0, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-NEXT: kandw %k5, %k6, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $13, %k4, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT: korw %k1, %k6, %k6
+; AVX512BW-NEXT: vmovdqa32 320(%rsi), %zmm5 {%k6} {z}
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $14, %k5, %k0
+; AVX512BW-NEXT: kshiftrq $14, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -11379,91 +11381,94 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $15, %k5, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $15, %k4, %k1
+; AVX512BW-NEXT: kmovq %k4, %k3
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 384(%rsi), %zmm6 {%k1} {z}
-; AVX512BW-NEXT: kshiftrq $16, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kmovq %k3, %k2
+; AVX512BW-NEXT: kshiftrq $16, %k3, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $17, %k5, %k0
-; AVX512BW-NEXT: kmovq %k5, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $17, %k2, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $18, %k7, %k1
+; AVX512BW-NEXT: kandw %k7, %k0, %k0
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $18, %k4, %k1
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -11475,29 +11480,27 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $19, %k7, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $19, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -11505,29 +11508,30 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $20, %k7, %k6
+; AVX512BW-NEXT: kshiftrq $20, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -11538,58 +11542,59 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $21, %k7, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $21, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k6
-; AVX512BW-NEXT: kshiftrq $22, %k7, %k0
-; AVX512BW-NEXT: kmovq %k7, %k2
+; AVX512BW-NEXT: kshiftrq $22, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
@@ -11598,54 +11603,52 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $23, %k2, %k0
+; AVX512BW-NEXT: kshiftrq $23, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $24, %k2, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $24, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -11658,7 +11661,7 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $25, %k2, %k1
+; AVX512BW-NEXT: kshiftrq $25, %k4, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm10 {%k7} {z}
@@ -11670,140 +11673,137 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq %k2, %k7
-; AVX512BW-NEXT: kshiftrq $26, %k2, %k0
+; AVX512BW-NEXT: kshiftrq $26, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $27, %k7, %k6
-; AVX512BW-NEXT: kmovq %k7, %k4
+; AVX512BW-NEXT: kshiftrq $27, %k4, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm11 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftrq $28, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $29, %k7, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k0, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $29, %k4, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT: korw %k1, %k6, %k6
+; AVX512BW-NEXT: vmovdqa32 768(%rsi), %zmm12 {%k6} {z}
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $30, %k7, %k0
+; AVX512BW-NEXT: kshiftrq $30, %k4, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
@@ -11818,92 +11818,92 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $31, %k7, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT: kandw %k1, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $31, %k4, %k1
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 832(%rsi), %zmm13 {%k1} {z}
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $32, %k5, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $32, %k4, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $33, %k5, %k0
-; AVX512BW-NEXT: kmovq %k5, %k7
+; AVX512BW-NEXT: kshiftrq $33, %k4, %k0
+; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k0
@@ -11920,48 +11920,49 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq %k3, %k7
; AVX512BW-NEXT: kshiftrq $35, %k3, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $36, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
@@ -11970,8 +11971,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -11986,96 +11987,96 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kandw %k4, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $37, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k6
+; AVX512BW-NEXT: kandw %k2, %k0, %k6
; AVX512BW-NEXT: kshiftrq $38, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
; AVX512BW-NEXT: kshiftrw $5, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k6, %k6
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: vmovdqa32 1024(%rsi), %zmm16 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $39, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kshiftrq $40, %k7, %k0
-; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
@@ -12083,37 +12084,36 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
-; AVX512BW-NEXT: kshiftrq $41, %k3, %k1
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $41, %k5, %k1
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1088(%rsi), %zmm17 {%k7} {z}
-; AVX512BW-NEXT: kandw %k5, %k1, %k0
+; AVX512BW-NEXT: kandw %k3, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k0, %k0
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -12124,42 +12124,45 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $42, %k3, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k1
+; AVX512BW-NEXT: kmovq %k5, %k7
+; AVX512BW-NEXT: kshiftrq $42, %k5, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $43, %k3, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $43, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -12170,70 +12173,68 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq %k3, %k7
-; AVX512BW-NEXT: kshiftrq $44, %k3, %k0
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $44, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
+; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovq %k4, %k5
-; AVX512BW-NEXT: kandw %k4, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $45, %k7, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $45, %k7, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k0, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT: korw %k1, %k6, %k6
+; AVX512BW-NEXT: vmovdqa32 1216(%rsi), %zmm19 {%k6} {z}
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
@@ -12241,18 +12242,18 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -12260,76 +12261,78 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $47, %k7, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
+; AVX512BW-NEXT: kandw %k5, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $47, %k7, %k1
+; AVX512BW-NEXT: kmovq %k7, %k4
+; AVX512BW-NEXT: kshiftlw $15, %k1, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $5, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k6
-; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm20 {%k1} {z}
-; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
-; AVX512BW-NEXT: kshiftrq $48, %k5, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $48, %k4, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k3, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $49, %k5, %k0
-; AVX512BW-NEXT: kmovq %k5, %k7
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $49, %k4, %k0
+; AVX512BW-NEXT: kmovq %k4, %k7
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
@@ -12340,17 +12343,17 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kandw %k2, %k0, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k0
; AVX512BW-NEXT: kshiftrq $50, %k7, %k1
-; AVX512BW-NEXT: kmovq %k7, %k2
+; AVX512BW-NEXT: kmovq %k7, %k3
; AVX512BW-NEXT: kshiftlw $14, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -12358,40 +12361,40 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k1, %k6
; AVX512BW-NEXT: korw %k6, %k0, %k7
; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm21 {%k7} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k2, %k1, %k0
; AVX512BW-NEXT: kshiftrw $14, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $51, %k2, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k0, %k1
+; AVX512BW-NEXT: kmovq %k3, %k7
+; AVX512BW-NEXT: kshiftrq $51, %k3, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -12399,12 +12402,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $52, %k2, %k6
+; AVX512BW-NEXT: kandw %k4, %k0, %k1
+; AVX512BW-NEXT: kshiftrq $52, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
@@ -12412,50 +12415,49 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
; AVX512BW-NEXT: korw %k0, %k1, %k1
; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm22 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k6, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $53, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k6, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kandw %k3, %k0, %k6
@@ -12471,44 +12473,45 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k3, %k6, %k6
; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k6, %k6
; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
; AVX512BW-NEXT: korw %k1, %k6, %k1
; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm23 {%k1} {z}
-; AVX512BW-NEXT: kandw %k2, %k0, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k1
; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
; AVX512BW-NEXT: kshiftrq $55, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k4, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
@@ -12518,8 +12521,8 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kshiftlw $15, %k0, %k6
; AVX512BW-NEXT: kshiftrw $7, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
@@ -12534,11 +12537,11 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $3, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftrw $2, %k6, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
@@ -12555,40 +12558,41 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kandw %k4, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $12, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k0
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k1, %k0, %k0
; AVX512BW-NEXT: kshiftrw $10, %k6, %k1
; AVX512BW-NEXT: korw %k1, %k0, %k0
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k0, %k1
+; AVX512BW-NEXT: kandw %k4, %k0, %k1
+; AVX512BW-NEXT: kmovq %k2, %k7
; AVX512BW-NEXT: kshiftrq $58, %k2, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
@@ -12597,12 +12601,12 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $59, %k2, %k6
+; AVX512BW-NEXT: kshiftrq $59, %k7, %k6
; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
; AVX512BW-NEXT: korw %k7, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
@@ -12613,72 +12617,71 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k1, %k6, %k1
; AVX512BW-NEXT: kshiftrw $14, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k5, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $12, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kmovq %k2, %k5
-; AVX512BW-NEXT: kshiftrq $60, %k2, %k0
+; AVX512BW-NEXT: kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; AVX512BW-NEXT: kshiftrq $60, %k7, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $11, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $10, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $9, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k3, %k0, %k6
+; AVX512BW-NEXT: kshiftrq $61, %k7, %k0
+; AVX512BW-NEXT: kmovq %k7, %k2
+; AVX512BW-NEXT: kshiftlw $15, %k0, %k1
+; AVX512BW-NEXT: kshiftrw $4, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kandw %k4, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $3, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k3, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $61, %k5, %k6
-; AVX512BW-NEXT: kshiftlw $15, %k6, %k0
-; AVX512BW-NEXT: kshiftrw $4, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $2, %k1, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k6, %k7
-; AVX512BW-NEXT: korw %k7, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
-; AVX512BW-NEXT: korw %k0, %k1, %k1
-; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k1} {z}
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k1, %k6, %k1
-; AVX512BW-NEXT: kshiftrw $14, %k0, %k0
-; AVX512BW-NEXT: korw %k0, %k1, %k0
+; AVX512BW-NEXT: kandw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $14, %k0, %k7
+; AVX512BW-NEXT: korw %k7, %k6, %k6
+; AVX512BW-NEXT: kshiftlw $1, %k6, %k6
+; AVX512BW-NEXT: kshiftrw $1, %k6, %k6
+; AVX512BW-NEXT: korw %k1, %k6, %k6
+; AVX512BW-NEXT: vmovdqa32 1664(%rsi), %zmm26 {%k6} {z}
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k0, %k0
+; AVX512BW-NEXT: kshiftrw $14, %k1, %k1
+; AVX512BW-NEXT: korw %k1, %k0, %k0
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftrq $62, %k5, %k0
+; AVX512BW-NEXT: kshiftrq $62, %k2, %k0
; AVX512BW-NEXT: kshiftlw $15, %k0, %k0
; AVX512BW-NEXT: kshiftrw $13, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
@@ -12702,32 +12705,32 @@ define void @mask_replication_factor7_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $8, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k6, %k1, %k1
; AVX512BW-NEXT: kshiftrw $7, %k0, %k0
; AVX512BW-NEXT: korw %k0, %k1, %k0
-; AVX512BW-NEXT: kshiftrq $63, %k5, %k5
+; AVX512BW-NEXT: kshiftrq $63, %k2, %k2
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512BW-NEXT: kandw %k1, %k0, %k1
-; AVX512BW-NEXT: kshiftlw $15, %k5, %k0
+; AVX512BW-NEXT: kshiftlw $15, %k2, %k0
; AVX512BW-NEXT: kshiftrw $6, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $5, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
-; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k5, %k1, %k1
; AVX512BW-NEXT: kshiftrw $4, %k0, %k6
; AVX512BW-NEXT: korw %k6, %k1, %k1
; AVX512BW-NEXT: kandw %k4, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $3, %k0, %k4
+; AVX512BW-NEXT: kshiftrw $3, %k0, %k5
+; AVX512BW-NEXT: korw %k5, %k1, %k1
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftrw $2, %k0, %k4
; AVX512BW-NEXT: korw %k4, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftrw $2, %k0, %k3
-; AVX512BW-NEXT: korw %k3, %k1, %k1
-; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-NEXT: kandw %k2, %k1, %k1
-; AVX512BW-NEXT: kshiftlw $14, %k5, %k2
+; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; AVX512BW-NEXT: kandw %k3, %k1, %k1
+; AVX512BW-NEXT: kshiftlw $14, %k2, %k2
; AVX512BW-NEXT: korw %k2, %k1, %k1
; AVX512BW-NEXT: kshiftlw $1, %k1, %k1
; AVX512BW-NEXT: kshiftrw $1, %k1, %k1
@@ -12797,7 +12800,7 @@ define void @mask_replication_factor8_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor8_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovw (%rdi), %k1
+; AVX512BW-NEXT: kmovq (%rdi), %k1
; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
@@ -12851,7 +12854,7 @@ define void @mask_replication_factor8_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out
;
; AVX512BW-LABEL: mask_replication_factor8_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd (%rdi), %k0
+; AVX512BW-NEXT: kmovq (%rdi), %k0
; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0