[llvm-branch-commits] [llvm] [SelectionDAG] Fold extracts spanning concat operands (PR #200936)
Krzysztof Drewniak via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jun 1 16:45:56 PDT 2026
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/200936
>From 352473f8739e910482aa54385d776efe99258c76 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 29 May 2026 22:43:01 +0000
Subject: [PATCH 1/2] [SelectionDAG] Fold extracts spanning concat operands
Factor the extract_subvector-of-CONCAT_VECTORS logic out of
visitEXTRACT_SUBVECTOR into a static helper, and additionally handle
extracts that cover multiple whole concat operands by rebuilding a
smaller concat directly.
AI note: an LLM generated the code and the test; I have read them.
Co-Authored-By: OpenAI Codex <codex at openai.com>
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 100 +-
.../AArch64/fixed-vector-interleave.ll | 148 +-
.../AArch64/sve-fixed-vector-llrint.ll | 374 ++--
.../CodeGen/AArch64/sve-fixed-vector-lrint.ll | 374 ++--
llvm/test/CodeGen/AMDGPU/bf16.ll | 1860 +++++++++--------
.../AMDGPU/dagcombine-extract-concat.ll | 49 +
llvm/test/CodeGen/X86/combine-pmuldq.ll | 16 +-
.../test/CodeGen/X86/ifma-combine-vpmadd52.ll | 24 +-
llvm/test/CodeGen/X86/madd.ll | 124 +-
llvm/test/CodeGen/X86/pmaddubsw.ll | 34 +-
.../vector-interleaved-store-i16-stride-6.ll | 282 +--
.../vector-interleaved-store-i32-stride-6.ll | 240 +--
.../vector-interleaved-store-i8-stride-6.ll | 372 ++--
.../CodeGen/X86/vector-replicaton-i1-mask.ll | 32 +-
llvm/test/CodeGen/X86/widen_fadd.ll | 116 +-
llvm/test/CodeGen/X86/widen_fdiv.ll | 25 +-
llvm/test/CodeGen/X86/widen_fmul.ll | 116 +-
llvm/test/CodeGen/X86/widen_fsub.ll | 116 +-
18 files changed, 2191 insertions(+), 2211 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/dagcombine-extract-concat.ll
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index dd74e63744f2e..58fc5ece9f3d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27545,6 +27545,70 @@ static SDValue foldExtractSubvectorFromShuffleVector(EVT NarrowVT, SDValue Src,
return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask);
}
+static SDValue foldExtractSubvectorFromConcatVectors(EVT NVT, SDValue V,
+ uint64_t ExtIdx,
+ const SDLoc &DL,
+ SelectionDAG &DAG,
+ bool LegalOperations) {
+ if (V.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+
+ unsigned ExtNumElts = NVT.getVectorMinNumElements();
+ EVT ConcatSrcVT = V.getOperand(0).getValueType();
+ assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "Concat and extract subvector do not change element type");
+
+ unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
+ unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
+ if (ConcatOpIdx >= V.getNumOperands())
+ return SDValue();
+
+ // If the concatenated source types match this extract, it's a direct
+ // simplification:
+ // extract_subvec (concat V1, V2, ...), i --> Vi
+ if (NVT.getVectorElementCount() == ConcatSrcVT.getVectorElementCount())
+ return V.getOperand(ConcatOpIdx);
+
+ if (!NVT.isFixedLengthVector() || !ConcatSrcVT.isFixedLengthVector())
+ return SDValue();
+
+ // If the concatenated source vectors are a multiple length of this extract,
+ // then extract a fraction of one of those source vectors directly from a
+ // concat operand. Example:
+ // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
+ // v2i8 extract_subvec v8i8 Y, 6
+ if (ConcatSrcNumElts % ExtNumElts == 0) {
+ uint64_t NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
+ if (NewExtIdx + ExtNumElts > ConcatSrcNumElts)
+ return SDValue();
+ assert(NewExtIdx % ExtNumElts == 0 &&
+ "Extract index is not a multiple of the input vector length.");
+ SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
+ V.getOperand(ConcatOpIdx), NewIndexC);
+ }
+
+ // If the extract covers multiple whole concat operands, rebuild that smaller
+ // concat directly.
+ if (ExtNumElts % ConcatSrcNumElts == 0 && ExtIdx % ConcatSrcNumElts == 0) {
+ if (LegalOperations &&
+ !DAG.getTargetLoweringInfo().isOperationLegalOrCustom(
+ ISD::CONCAT_VECTORS, NVT))
+ return SDValue();
+
+ unsigned NumConcatOps = ExtNumElts / ConcatSrcNumElts;
+ if (ConcatOpIdx + NumConcatOps > V.getNumOperands())
+ return SDValue();
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned I = 0; I != NumConcatOps; ++I)
+ Ops.push_back(V.getOperand(ConcatOpIdx + I));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, NVT, Ops);
+ }
+
+ return SDValue();
+}
+
SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
@@ -27653,38 +27717,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
}
}
- if (V.getOpcode() == ISD::CONCAT_VECTORS) {
- unsigned ExtNumElts = NVT.getVectorMinNumElements();
- EVT ConcatSrcVT = V.getOperand(0).getValueType();
- assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
- "Concat and extract subvector do not change element type");
-
- unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
- unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
-
- // If the concatenated source types match this extract, it's a direct
- // simplification:
- // extract_subvec (concat V1, V2, ...), i --> Vi
- if (NVT.getVectorElementCount() == ConcatSrcVT.getVectorElementCount())
- return V.getOperand(ConcatOpIdx);
-
- // If the concatenated source vectors are a multiple length of this extract,
- // then extract a fraction of one of those source vectors directly from a
- // concat operand. Example:
- // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->
- // v2i8 extract_subvec v8i8 Y, 6
- if (NVT.isFixedLengthVector() && ConcatSrcVT.isFixedLengthVector() &&
- ConcatSrcNumElts % ExtNumElts == 0) {
- unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
- assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
- "Trying to extract from >1 concat operand?");
- assert(NewExtIdx % ExtNumElts == 0 &&
- "Extract index is not a multiple of the input vector length.");
- SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
- V.getOperand(ConcatOpIdx), NewIndexC);
- }
- }
+ if (SDValue Folded =
+ foldExtractSubvectorFromConcatVectors(NVT, V, ExtIdx, DL, DAG,
+ LegalOperations))
+ return Folded;
if (SDValue Shuffle = foldExtractSubvectorFromShuffleVector(
NVT, V, ExtIdx, DL, DAG, LegalOperations))
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
index 4ac0276aabfec..2d368c0ec456f 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-interleave.ll
@@ -209,12 +209,12 @@ define <4 x i16> @interleave2_diff_nonconst_splat_v4i16(i16 %a, i16 %b) {
define <32 x i8> @interleave4_v32i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, <8 x i8> %vec3) {
; CHECK-LABEL: interleave4_v32i8:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v4.8b, v1.8b, v3.8b
-; CHECK-NEXT: zip1 v5.8b, v0.8b, v2.8b
-; CHECK-NEXT: zip2 v1.8b, v1.8b, v3.8b
-; CHECK-NEXT: zip2 v2.8b, v0.8b, v2.8b
-; CHECK-NEXT: zip1 v0.16b, v5.16b, v4.16b
-; CHECK-NEXT: zip1 v1.16b, v2.16b, v1.16b
+; CHECK-NEXT: zip2 v4.8b, v1.8b, v3.8b
+; CHECK-NEXT: zip2 v5.8b, v0.8b, v2.8b
+; CHECK-NEXT: zip1 v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: zip1 v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: zip1 v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: zip1 v1.16b, v5.16b, v4.16b
; CHECK-NEXT: ret
%retval = call <32 x i8> @llvm.vector.interleave4.v32i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, <8 x i8> %vec3)
ret <32 x i8> %retval
@@ -239,12 +239,12 @@ define <64 x i8> @interleave4_v64i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8>
define <16 x i16> @interleave4_v16i16(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2, <4 x i16> %vec3) {
; CHECK-LABEL: interleave4_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v4.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip1 v5.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip2 v1.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip2 v2.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip1 v0.8h, v5.8h, v4.8h
-; CHECK-NEXT: zip1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: zip2 v4.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip2 v5.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v1.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: zip1 v1.8h, v5.8h, v4.8h
; CHECK-NEXT: ret
%retval = call <16 x i16> @llvm.vector.interleave4.v16i16(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2, <4 x i16> %vec3)
ret <16 x i16> %retval
@@ -269,12 +269,12 @@ define <32 x i16> @interleave4_v32i16(<8 x i16> %vec0, <8 x i16> %vec1, <8 x i16
define <8 x i32> @interleave4_v8i32(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> %vec3) {
; CHECK-LABEL: interleave4_v8i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v4.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip1 v5.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip2 v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip1 v0.4s, v5.4s, v4.4s
-; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: zip2 v4.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip2 v5.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: zip1 v1.4s, v5.4s, v4.4s
; CHECK-NEXT: ret
%retval = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> %vec3)
ret <8 x i32> %retval
@@ -330,12 +330,12 @@ define <8 x i64> @interleave4_v8i64(<2 x i64> %vec0, <2 x i64> %vec1, <2 x i64>
define <16 x half> @interleave4_v16f16(<4 x half> %vec0, <4 x half> %vec1, <4 x half> %vec2, <4 x half> %vec3) {
; CHECK-LABEL: interleave4_v16f16:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v4.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip1 v5.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip2 v1.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip2 v2.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip1 v0.8h, v5.8h, v4.8h
-; CHECK-NEXT: zip1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: zip2 v4.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip2 v5.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v1.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: zip1 v1.8h, v5.8h, v4.8h
; CHECK-NEXT: ret
%retval = call <16 x half> @llvm.vector.interleave4.v16f16(<4 x half> %vec0, <4 x half> %vec1, <4 x half> %vec2, <4 x half> %vec3)
ret <16 x half> %retval
@@ -360,12 +360,12 @@ define <32 x half> @interleave4_v32f16(<8 x half> %vec0, <8 x half> %vec1, <8 x
define <8 x float> @interleave4_v8f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %vec2, <2 x float> %vec3) {
; CHECK-LABEL: interleave4_v8f32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v4.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip1 v5.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip2 v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip1 v0.4s, v5.4s, v4.4s
-; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: zip2 v4.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip2 v5.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip1 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: zip1 v1.4s, v5.4s, v4.4s
; CHECK-NEXT: ret
%retval = call <8 x float> @llvm.vector.interleave4.v8f32(<2 x float> %vec0, <2 x float> %vec1, <2 x float> %vec2, <2 x float> %vec3)
ret <8 x float> %retval
@@ -421,12 +421,12 @@ define <8 x double> @interleave4_v8f64(<2 x double> %vec0, <2 x double> %vec1, <
define <16 x bfloat> @interleave4_v16bf16(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, <4 x bfloat> %vec3) {
; CHECK-LABEL: interleave4_v16bf16:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v4.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip1 v5.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip2 v1.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip2 v2.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip1 v0.8h, v5.8h, v4.8h
-; CHECK-NEXT: zip1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: zip2 v4.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip2 v5.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v1.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: zip1 v1.8h, v5.8h, v4.8h
; CHECK-NEXT: ret
%retval = call <16 x bfloat> @llvm.vector.interleave4.v16bf16(<4 x bfloat> %vec0, <4 x bfloat> %vec1, <4 x bfloat> %vec2, <4 x bfloat> %vec3)
ret <16 x bfloat> %retval
@@ -761,26 +761,26 @@ define <16 x double> @interleave8_v16f64(<2 x double> %vec0, <2 x double> %vec1,
define <16 x i32> @interleave8_v16i32(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> %vec3, <2 x i32> %vec4, <2 x i32> %vec5, <2 x i32> %vec6, <2 x i32> %vec7) {
; CHECK-LABEL: interleave8_v16i32:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v16.2s, v3.2s, v7.2s
-; CHECK-NEXT: zip1 v17.2s, v1.2s, v5.2s
-; CHECK-NEXT: zip1 v18.2s, v2.2s, v6.2s
-; CHECK-NEXT: zip1 v19.2s, v0.2s, v4.2s
-; CHECK-NEXT: zip2 v3.2s, v3.2s, v7.2s
-; CHECK-NEXT: zip2 v1.2s, v1.2s, v5.2s
-; CHECK-NEXT: zip2 v2.2s, v2.2s, v6.2s
-; CHECK-NEXT: zip2 v0.2s, v0.2s, v4.2s
-; CHECK-NEXT: zip1 v4.2s, v17.2s, v16.2s
-; CHECK-NEXT: zip2 v6.2s, v17.2s, v16.2s
-; CHECK-NEXT: zip1 v5.2s, v19.2s, v18.2s
-; CHECK-NEXT: zip2 v7.2s, v19.2s, v18.2s
+; CHECK-NEXT: zip2 v16.2s, v2.2s, v6.2s
+; CHECK-NEXT: zip2 v17.2s, v0.2s, v4.2s
+; CHECK-NEXT: zip2 v18.2s, v3.2s, v7.2s
+; CHECK-NEXT: zip2 v19.2s, v1.2s, v5.2s
+; CHECK-NEXT: zip1 v2.2s, v2.2s, v6.2s
+; CHECK-NEXT: zip1 v0.2s, v0.2s, v4.2s
+; CHECK-NEXT: zip1 v3.2s, v3.2s, v7.2s
+; CHECK-NEXT: zip1 v1.2s, v1.2s, v5.2s
+; CHECK-NEXT: zip2 v4.2s, v17.2s, v16.2s
+; CHECK-NEXT: zip1 v6.2s, v17.2s, v16.2s
+; CHECK-NEXT: zip2 v5.2s, v19.2s, v18.2s
+; CHECK-NEXT: zip1 v7.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip1 v17.2s, v19.2s, v18.2s
; CHECK-NEXT: zip1 v16.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip1 v17.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip2 v3.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip2 v18.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip1 v0.4s, v5.4s, v4.4s
-; CHECK-NEXT: zip1 v1.4s, v7.4s, v6.4s
-; CHECK-NEXT: zip1 v2.4s, v17.4s, v16.4s
-; CHECK-NEXT: zip1 v3.4s, v18.4s, v3.4s
+; CHECK-NEXT: zip2 v2.2s, v0.2s, v2.2s
+; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT: zip1 v3.4s, v4.4s, v5.4s
+; CHECK-NEXT: zip1 v0.4s, v7.4s, v16.4s
+; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT: zip1 v2.4s, v6.4s, v17.4s
; CHECK-NEXT: ret
%retval = call <16 x i32> @llvm.vector.interleave8.v16i32(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2, <2 x i32> %vec3, <2 x i32> %vec4, <2 x i32> %vec5, <2 x i32> %vec6, <2 x i32> %vec7)
ret <16 x i32> %retval
@@ -789,26 +789,26 @@ define <16 x i32> @interleave8_v16i32(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32
define <32 x i16> @interleave8_v32i16(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2, <4 x i16> %vec3, <4 x i16> %vec4, <4 x i16> %vec5, <4 x i16> %vec6, <4 x i16> %vec7) {
; CHECK-LABEL: interleave8_v32i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: zip1 v16.4h, v3.4h, v7.4h
-; CHECK-NEXT: zip1 v17.4h, v1.4h, v5.4h
-; CHECK-NEXT: zip1 v18.4h, v2.4h, v6.4h
-; CHECK-NEXT: zip1 v19.4h, v0.4h, v4.4h
-; CHECK-NEXT: zip2 v3.4h, v3.4h, v7.4h
-; CHECK-NEXT: zip2 v1.4h, v1.4h, v5.4h
-; CHECK-NEXT: zip2 v2.4h, v2.4h, v6.4h
-; CHECK-NEXT: zip2 v0.4h, v0.4h, v4.4h
-; CHECK-NEXT: zip1 v4.4h, v17.4h, v16.4h
-; CHECK-NEXT: zip2 v6.4h, v17.4h, v16.4h
-; CHECK-NEXT: zip1 v5.4h, v19.4h, v18.4h
-; CHECK-NEXT: zip2 v7.4h, v19.4h, v18.4h
+; CHECK-NEXT: zip2 v16.4h, v2.4h, v6.4h
+; CHECK-NEXT: zip2 v17.4h, v0.4h, v4.4h
+; CHECK-NEXT: zip2 v18.4h, v3.4h, v7.4h
+; CHECK-NEXT: zip2 v19.4h, v1.4h, v5.4h
+; CHECK-NEXT: zip1 v2.4h, v2.4h, v6.4h
+; CHECK-NEXT: zip1 v0.4h, v0.4h, v4.4h
+; CHECK-NEXT: zip1 v3.4h, v3.4h, v7.4h
+; CHECK-NEXT: zip1 v1.4h, v1.4h, v5.4h
+; CHECK-NEXT: zip2 v4.4h, v17.4h, v16.4h
+; CHECK-NEXT: zip1 v6.4h, v17.4h, v16.4h
+; CHECK-NEXT: zip2 v5.4h, v19.4h, v18.4h
+; CHECK-NEXT: zip1 v7.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip1 v17.4h, v19.4h, v18.4h
; CHECK-NEXT: zip1 v16.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip1 v17.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip2 v3.4h, v1.4h, v3.4h
-; CHECK-NEXT: zip2 v18.4h, v0.4h, v2.4h
-; CHECK-NEXT: zip1 v0.8h, v5.8h, v4.8h
-; CHECK-NEXT: zip1 v1.8h, v7.8h, v6.8h
-; CHECK-NEXT: zip1 v2.8h, v17.8h, v16.8h
-; CHECK-NEXT: zip1 v3.8h, v18.8h, v3.8h
+; CHECK-NEXT: zip2 v2.4h, v0.4h, v2.4h
+; CHECK-NEXT: zip2 v1.4h, v1.4h, v3.4h
+; CHECK-NEXT: zip1 v3.8h, v4.8h, v5.8h
+; CHECK-NEXT: zip1 v0.8h, v7.8h, v16.8h
+; CHECK-NEXT: zip1 v1.8h, v2.8h, v1.8h
+; CHECK-NEXT: zip1 v2.8h, v6.8h, v17.8h
; CHECK-NEXT: ret
%retval = call <32 x i16> @llvm.vector.interleave8.v32i16(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2, <4 x i16> %vec3, <4 x i16> %vec4, <4 x i16> %vec5, <4 x i16> %vec6, <4 x i16> %vec7)
ret <32 x i16> %retval
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
index 88c0b34366809..fe4219c99b551 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-llrint.ll
@@ -874,42 +874,34 @@ declare <2 x i64> @llvm.llrint.v2i64.v2fp128(<2 x fp128>)
define <4 x i64> @llrint_v4i64_v4fp128(<4 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v4i64_v4fp128:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str q0, [sp, #48] // 16-byte Spill
-; CHECK-NEXT: mov v0.16b, v3.16b
-; CHECK-NEXT: stp q2, q1, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #80
+; CHECK-NEXT: str x30, [sp, #48] // 8-byte Spill
+; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-NEXT: stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #32] // 16-byte Spill
+; CHECK-NEXT: bl llrintl
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Reload
+; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #64
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: fmov d1, x20
+; CHECK-NEXT: str q0, [sp] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x19
+; CHECK-NEXT: str q0, [sp, #16] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-NEXT: str q1, [sp, #32] // 16-byte Spill
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #32] // 16-byte Spill
-; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Reload
-; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #64
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: add sp, sp, #64
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp q0, q4, [sp, #16] // 32-byte Folded Reload
+; CHECK-NEXT: fmov d2, x0
+; CHECK-NEXT: ldr q1, [sp] // 16-byte Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Reload
+; CHECK-NEXT: mov v1.d[1], v4.d[0]
+; CHECK-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-NEXT: add sp, sp, #80
; CHECK-NEXT: ret
%a = call <4 x i64> @llvm.llrint.v4i64.v4fp128(<4 x fp128> %x)
ret <4 x i64> %a
@@ -919,74 +911,64 @@ declare <4 x i64> @llvm.llrint.v4i64.v4fp128(<4 x fp128>)
define <8 x i64> @llrint_v8i64_v8fp128(<8 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v8i64_v8fp128:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #128
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str q0, [sp, #112] // 16-byte Spill
-; CHECK-NEXT: mov v0.16b, v7.16b
-; CHECK-NEXT: stp q6, q5, [sp, #16] // 32-byte Folded Spill
-; CHECK-NEXT: stp q4, q3, [sp, #48] // 32-byte Folded Spill
-; CHECK-NEXT: stp q2, q1, [sp, #80] // 32-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #176
+; CHECK-NEXT: str x30, [sp, #112] // 8-byte Spill
+; CHECK-NEXT: stp x24, x23, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-NEXT: stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-NEXT: stp q5, q4, [sp, #32] // 32-byte Folded Spill
+; CHECK-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #96] // 16-byte Spill
+; CHECK-NEXT: bl llrintl
+; CHECK-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Reload
+; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #128
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-NEXT: mov x21, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #32] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Reload
+; CHECK-NEXT: mov x22, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #128
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Reload
+; CHECK-NEXT: mov x23, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #64] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Reload
+; CHECK-NEXT: mov x24, x0
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #128
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-NEXT: fmov d1, x24
+; CHECK-NEXT: str q0, [sp, #80] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x23
+; CHECK-NEXT: str q0, [sp, #64] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x21
+; CHECK-NEXT: str q0, [sp, #48] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x19
+; CHECK-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Reload
+; CHECK-NEXT: str q1, [sp, #96] // 16-byte Spill
+; CHECK-NEXT: fmov d1, x22
+; CHECK-NEXT: str q1, [sp, #32] // 16-byte Spill
+; CHECK-NEXT: fmov d1, x20
+; CHECK-NEXT: str q1, [sp, #16] // 16-byte Spill
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #96] // 16-byte Spill
-; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Reload
-; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #128
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: fmov d1, x0
+; CHECK-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-NEXT: ldr x30, [sp, #112] // 8-byte Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
-; CHECK-NEXT: movprfx z3, z2
-; CHECK-NEXT: ext z3.b, z3.b, z2.b, #16
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: add sp, sp, #128
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp q3, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-NEXT: ldr q2, [sp, #64] // 16-byte Reload
+; CHECK-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-NEXT: ldp q3, q4, [sp, #80] // 32-byte Folded Reload
+; CHECK-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-NEXT: add sp, sp, #176
; CHECK-NEXT: ret
%a = call <8 x i64> @llvm.llrint.v8i64.v8fp128(<8 x fp128> %x)
ret <8 x i64> %a
@@ -996,158 +978,134 @@ declare <8 x i64> @llvm.llrint.v8i64.v8fp128(<8 x fp128>)
define <16 x i64> @llrint_v16fp128(<16 x fp128> %x) nounwind {
; CHECK-LABEL: llrint_v16fp128:
; CHECK: // %bb.0:
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: sub sp, sp, #256
-; CHECK-NEXT: addvl sp, sp, #-4
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: str q1, [sp, #240] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #272]
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: str q0, [sp, #224] // 16-byte Spill
-; CHECK-NEXT: stp q7, q6, [sp, #128] // 32-byte Folded Spill
-; CHECK-NEXT: str q1, [sp, #112] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #288]
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: stp q5, q4, [sp, #160] // 32-byte Folded Spill
-; CHECK-NEXT: str q1, [sp, #96] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #304]
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: stp q3, q2, [sp, #192] // 32-byte Folded Spill
-; CHECK-NEXT: str q1, [sp, #80] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #320]
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: str q1, [sp, #64] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #336]
-; CHECK-NEXT: addvl x8, sp, #4
+; CHECK-NEXT: sub sp, sp, #368
+; CHECK-NEXT: stp q3, q1, [sp, #240] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [sp, #464]
+; CHECK-NEXT: stp x29, x30, [sp, #272] // 16-byte Folded Spill
+; CHECK-NEXT: stp q2, q4, [sp, #16] // 32-byte Folded Spill
+; CHECK-NEXT: stp q1, q5, [sp, #192] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [sp, #480]
+; CHECK-NEXT: stp x28, x27, [sp, #288] // 16-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #176] // 16-byte Spill
+; CHECK-NEXT: ldr q1, [sp, #432]
+; CHECK-NEXT: stp x26, x25, [sp, #304] // 16-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #160] // 16-byte Spill
+; CHECK-NEXT: ldr q1, [sp, #448]
+; CHECK-NEXT: stp x24, x23, [sp, #320] // 16-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #144] // 16-byte Spill
+; CHECK-NEXT: ldr q1, [sp, #400]
+; CHECK-NEXT: stp x22, x21, [sp, #336] // 16-byte Folded Spill
+; CHECK-NEXT: str q1, [sp, #128] // 16-byte Spill
+; CHECK-NEXT: ldr q1, [sp, #416]
+; CHECK-NEXT: stp x20, x19, [sp, #352] // 16-byte Folded Spill
+; CHECK-NEXT: stp q6, q1, [sp, #96] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [sp, #368]
+; CHECK-NEXT: stp q7, q1, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT: ldr q1, [sp, #384]
; CHECK-NEXT: str q1, [sp, #48] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #352]
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: str q1, [sp, #32] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #368]
-; CHECK-NEXT: addvl x8, sp, #4
-; CHECK-NEXT: str q1, [sp, #16] // 16-byte Spill
-; CHECK-NEXT: ldr q1, [x8, #384]
-; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp] // 16-byte Spill
+; CHECK-NEXT: ldr q0, [sp, #240] // 16-byte Reload
+; CHECK-NEXT: str x0, [sp, #224] // 8-byte Spill
+; CHECK-NEXT: bl llrintl
; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Reload
+; CHECK-NEXT: str x0, [sp, #240] // 8-byte Spill
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-NEXT: ldr q0, [sp, #208] // 16-byte Reload
+; CHECK-NEXT: mov x22, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #32] // 16-byte Spill
-; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Reload
+; CHECK-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-NEXT: str x0, [sp, #208] // 8-byte Spill
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #64] // 16-byte Reload
+; CHECK-NEXT: mov x24, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #64] // 16-byte Spill
-; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Reload
-; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #96] // 16-byte Reload
+; CHECK-NEXT: mov x23, x0
+; CHECK-NEXT: bl llrintl
+; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Reload
+; CHECK-NEXT: mov x25, x0
+; CHECK-NEXT: bl llrintl
+; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Reload
+; CHECK-NEXT: mov x26, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #96] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Reload
+; CHECK-NEXT: mov x27, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #128] // 16-byte Reload
+; CHECK-NEXT: mov x28, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #128] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #144] // 16-byte Reload
+; CHECK-NEXT: mov x29, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #128] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #160] // 16-byte Reload
+; CHECK-NEXT: mov x19, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #160] // 16-byte Spill
; CHECK-NEXT: ldr q0, [sp, #176] // 16-byte Reload
+; CHECK-NEXT: mov x20, x0
; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #160] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT: ldr q0, [sp, #192] // 16-byte Reload
+; CHECK-NEXT: mov x21, x0
; CHECK-NEXT: bl llrintl
; CHECK-NEXT: fmov d0, x0
+; CHECK-NEXT: ldr x8, [sp, #224] // 8-byte Reload
+; CHECK-NEXT: fmov d1, x23
; CHECK-NEXT: str q0, [sp, #192] // 16-byte Spill
-; CHECK-NEXT: ldr q0, [sp, #208] // 16-byte Reload
-; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #192] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #240] // 16-byte Reload
+; CHECK-NEXT: fmov d0, x21
+; CHECK-NEXT: str q0, [sp, #176] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x20
+; CHECK-NEXT: str q0, [sp, #160] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x19
+; CHECK-NEXT: str q0, [sp, #144] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x29
+; CHECK-NEXT: str q0, [sp, #128] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x27
+; CHECK-NEXT: str q0, [sp, #112] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x25
+; CHECK-NEXT: str q0, [sp, #96] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x24
+; CHECK-NEXT: str q0, [sp, #48] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x22
+; CHECK-NEXT: str q0, [sp, #32] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: str q0, [sp, #80] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x28
+; CHECK-NEXT: str q0, [sp, #224] // 16-byte Spill
+; CHECK-NEXT: fmov d0, x26
+; CHECK-NEXT: str q0, [sp, #64] // 16-byte Spill
+; CHECK-NEXT: ldr q0, [sp, #256] // 16-byte Reload
+; CHECK-NEXT: str q1, [sp, #256] // 16-byte Spill
+; CHECK-NEXT: ldr d1, [sp, #208] // 8-byte Reload
+; CHECK-NEXT: str q1, [sp, #208] // 16-byte Spill
+; CHECK-NEXT: ldr d1, [sp, #240] // 8-byte Reload
+; CHECK-NEXT: str q1, [sp, #240] // 16-byte Spill
; CHECK-NEXT: bl llrintl
+; CHECK-NEXT: ldp q1, q2, [sp, #32] // 32-byte Folded Reload
+; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Reload
+; CHECK-NEXT: ldp q0, q4, [sp, #240] // 32-byte Folded Reload
+; CHECK-NEXT: ldp q16, q3, [sp, #80] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x20, x19, [sp, #352] // 16-byte Folded Reload
+; CHECK-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-NEXT: ldp q7, q0, [sp, #192] // 32-byte Folded Reload
+; CHECK-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-NEXT: ldr q4, [sp, #112] // 16-byte Reload
+; CHECK-NEXT: ldp x22, x21, [sp, #336] // 16-byte Folded Reload
+; CHECK-NEXT: mov v2.d[1], v0.d[0]
; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: str q0, [sp, #240] // 16-byte Spill
+; CHECK-NEXT: mov v4.d[1], v5.d[0]
+; CHECK-NEXT: ldr q5, [sp, #128] // 16-byte Reload
+; CHECK-NEXT: ldp x24, x23, [sp, #320] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x26, x25, [sp, #304] // 16-byte Folded Reload
+; CHECK-NEXT: mov v16.d[1], v0.d[0]
; CHECK-NEXT: ldr q0, [sp, #224] // 16-byte Reload
-; CHECK-NEXT: bl llrintl
-; CHECK-NEXT: fmov d0, x0
-; CHECK-NEXT: ldr q1, [sp, #240] // 16-byte Reload
-; CHECK-NEXT: add x8, sp, #256
-; CHECK-NEXT: ptrue p0.d, vl2
-; CHECK-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z4, [x8, #2, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr z6, [x8, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload
-; CHECK-NEXT: movprfx z3, z2
-; CHECK-NEXT: ext z3.b, z3.b, z2.b, #16
-; CHECK-NEXT: movprfx z5, z4
-; CHECK-NEXT: ext z5.b, z5.b, z4.b, #16
-; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2
-; CHECK-NEXT: // kill: def $q4 killed $q4 killed $z4
-; CHECK-NEXT: movprfx z7, z6
-; CHECK-NEXT: ext z7.b, z7.b, z6.b, #16
-; CHECK-NEXT: // kill: def $q6 killed $q6 killed $z6
-; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-NEXT: // kill: def $q3 killed $q3 killed $z3
-; CHECK-NEXT: // kill: def $q5 killed $q5 killed $z5
-; CHECK-NEXT: // kill: def $q7 killed $q7 killed $z7
-; CHECK-NEXT: movprfx z1, z0
-; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
-; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-NEXT: addvl sp, sp, #4
-; CHECK-NEXT: add sp, sp, #256
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x27, [sp, #288] // 16-byte Folded Reload
+; CHECK-NEXT: mov v5.d[1], v0.d[0]
+; CHECK-NEXT: ldp q0, q6, [sp, #144] // 32-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp, #272] // 16-byte Folded Reload
+; CHECK-NEXT: mov v6.d[1], v0.d[0]
+; CHECK-NEXT: ldr q0, [sp, #176] // 16-byte Reload
+; CHECK-NEXT: mov v7.d[1], v0.d[0]
+; CHECK-NEXT: mov v0.16b, v16.16b
+; CHECK-NEXT: add sp, sp, #368
; CHECK-NEXT: ret
%a = call <16 x i64> @llvm.llrint.v16i64.v16fp128(<16 x fp128> %x)
ret <16 x i64> %a
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
index c9553388a5d4e..9b6aab6d33db7 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-vector-lrint.ll
@@ -1631,42 +1631,34 @@ define <4 x iXLen> @lrint_v4fp128(<4 x fp128> %x) nounwind {
;
; CHECK-i64-LABEL: lrint_v4fp128:
; CHECK-i64: // %bb.0:
-; CHECK-i64-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-i64-NEXT: sub sp, sp, #64
-; CHECK-i64-NEXT: addvl sp, sp, #-1
-; CHECK-i64-NEXT: str q0, [sp, #48] // 16-byte Spill
-; CHECK-i64-NEXT: mov v0.16b, v3.16b
-; CHECK-i64-NEXT: stp q2, q1, [sp, #16] // 32-byte Folded Spill
+; CHECK-i64-NEXT: sub sp, sp, #80
+; CHECK-i64-NEXT: str x30, [sp, #48] // 8-byte Spill
+; CHECK-i64-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-i64-NEXT: stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-i64-NEXT: str q1, [sp, #32] // 16-byte Spill
+; CHECK-i64-NEXT: bl lrintl
+; CHECK-i64-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-i64-NEXT: mov x19, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #16] // 16-byte Reload
+; CHECK-i64-NEXT: mov x20, x0
; CHECK-i64-NEXT: bl lrintl
; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #64
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-i64-NEXT: fmov d1, x20
+; CHECK-i64-NEXT: str q0, [sp] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x19
+; CHECK-i64-NEXT: str q0, [sp, #16] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-i64-NEXT: str q1, [sp, #32] // 16-byte Spill
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #32] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #48] // 16-byte Reload
-; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #32] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #64
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8] // 16-byte Folded Reload
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: movprfx z1, z0
-; CHECK-i64-NEXT: ext z1.b, z1.b, z0.b, #16
-; CHECK-i64-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-i64-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-i64-NEXT: addvl sp, sp, #1
-; CHECK-i64-NEXT: add sp, sp, #64
-; CHECK-i64-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-i64-NEXT: ldp q0, q4, [sp, #16] // 32-byte Folded Reload
+; CHECK-i64-NEXT: fmov d2, x0
+; CHECK-i64-NEXT: ldr q1, [sp] // 16-byte Reload
+; CHECK-i64-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-i64-NEXT: ldr x30, [sp, #48] // 8-byte Reload
+; CHECK-i64-NEXT: mov v1.d[1], v4.d[0]
+; CHECK-i64-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-i64-NEXT: add sp, sp, #80
; CHECK-i64-NEXT: ret
%a = call <4 x iXLen> @llvm.lrint.v4iXLen.v4fp128(<4 x fp128> %x)
ret <4 x iXLen> %a
@@ -1726,74 +1718,64 @@ define <8 x iXLen> @lrint_v8fp128(<8 x fp128> %x) nounwind {
;
; CHECK-i64-LABEL: lrint_v8fp128:
; CHECK-i64: // %bb.0:
-; CHECK-i64-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-i64-NEXT: sub sp, sp, #128
-; CHECK-i64-NEXT: addvl sp, sp, #-2
-; CHECK-i64-NEXT: str q0, [sp, #112] // 16-byte Spill
-; CHECK-i64-NEXT: mov v0.16b, v7.16b
-; CHECK-i64-NEXT: stp q6, q5, [sp, #16] // 32-byte Folded Spill
-; CHECK-i64-NEXT: stp q4, q3, [sp, #48] // 32-byte Folded Spill
-; CHECK-i64-NEXT: stp q2, q1, [sp, #80] // 32-byte Folded Spill
+; CHECK-i64-NEXT: sub sp, sp, #176
+; CHECK-i64-NEXT: str x30, [sp, #112] // 8-byte Spill
+; CHECK-i64-NEXT: stp x24, x23, [sp, #128] // 16-byte Folded Spill
+; CHECK-i64-NEXT: stp x22, x21, [sp, #144] // 16-byte Folded Spill
+; CHECK-i64-NEXT: stp x20, x19, [sp, #160] // 16-byte Folded Spill
+; CHECK-i64-NEXT: stp q3, q2, [sp] // 32-byte Folded Spill
+; CHECK-i64-NEXT: stp q5, q4, [sp, #32] // 32-byte Folded Spill
+; CHECK-i64-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill
+; CHECK-i64-NEXT: str q1, [sp, #96] // 16-byte Spill
+; CHECK-i64-NEXT: bl lrintl
+; CHECK-i64-NEXT: ldr q0, [sp] // 16-byte Reload
+; CHECK-i64-NEXT: mov x19, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #16] // 16-byte Reload
+; CHECK-i64-NEXT: mov x20, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #128
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-i64-NEXT: mov x21, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #32] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #48] // 16-byte Reload
+; CHECK-i64-NEXT: mov x22, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #32] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #128
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #64] // 16-byte Reload
+; CHECK-i64-NEXT: mov x23, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #64] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #80] // 16-byte Reload
+; CHECK-i64-NEXT: mov x24, x0
; CHECK-i64-NEXT: bl lrintl
; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #64] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #128
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8] // 16-byte Folded Spill
+; CHECK-i64-NEXT: fmov d1, x24
+; CHECK-i64-NEXT: str q0, [sp, #80] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x23
+; CHECK-i64-NEXT: str q0, [sp, #64] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x21
+; CHECK-i64-NEXT: str q0, [sp, #48] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x19
+; CHECK-i64-NEXT: str q0, [sp] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #96] // 16-byte Reload
+; CHECK-i64-NEXT: str q1, [sp, #96] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d1, x22
+; CHECK-i64-NEXT: str q1, [sp, #32] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d1, x20
+; CHECK-i64-NEXT: str q1, [sp, #16] // 16-byte Spill
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #96] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #112] // 16-byte Reload
-; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #96] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #128
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload
+; CHECK-i64-NEXT: fmov d1, x0
+; CHECK-i64-NEXT: ldp q0, q2, [sp] // 32-byte Folded Reload
+; CHECK-i64-NEXT: ldp x20, x19, [sp, #160] // 16-byte Folded Reload
+; CHECK-i64-NEXT: ldr x30, [sp, #112] // 8-byte Reload
+; CHECK-i64-NEXT: ldp x22, x21, [sp, #144] // 16-byte Folded Reload
; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8] // 16-byte Folded Reload
-; CHECK-i64-NEXT: movprfx z3, z2
-; CHECK-i64-NEXT: ext z3.b, z3.b, z2.b, #16
-; CHECK-i64-NEXT: // kill: def $q2 killed $q2 killed $z2
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: // kill: def $q3 killed $q3 killed $z3
-; CHECK-i64-NEXT: movprfx z1, z0
-; CHECK-i64-NEXT: ext z1.b, z1.b, z0.b, #16
-; CHECK-i64-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-i64-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-i64-NEXT: addvl sp, sp, #2
-; CHECK-i64-NEXT: add sp, sp, #128
-; CHECK-i64-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-i64-NEXT: ldp q3, q1, [sp, #32] // 32-byte Folded Reload
+; CHECK-i64-NEXT: ldp x24, x23, [sp, #128] // 16-byte Folded Reload
+; CHECK-i64-NEXT: mov v1.d[1], v2.d[0]
+; CHECK-i64-NEXT: ldr q2, [sp, #64] // 16-byte Reload
+; CHECK-i64-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-i64-NEXT: ldp q3, q4, [sp, #80] // 32-byte Folded Reload
+; CHECK-i64-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-i64-NEXT: add sp, sp, #176
; CHECK-i64-NEXT: ret
%a = call <8 x iXLen> @llvm.lrint.v8iXLen.v8fp128(<8 x fp128> %x)
ret <8 x iXLen> %a
@@ -1909,158 +1891,134 @@ define <16 x iXLen> @lrint_v16fp128(<16 x fp128> %x) nounwind {
;
; CHECK-i64-LABEL: lrint_v16fp128:
; CHECK-i64: // %bb.0:
-; CHECK-i64-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-i64-NEXT: sub sp, sp, #256
-; CHECK-i64-NEXT: addvl sp, sp, #-4
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: str q1, [sp, #240] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #272]
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: str q0, [sp, #224] // 16-byte Spill
-; CHECK-i64-NEXT: stp q7, q6, [sp, #128] // 32-byte Folded Spill
-; CHECK-i64-NEXT: str q1, [sp, #112] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #288]
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: stp q5, q4, [sp, #160] // 32-byte Folded Spill
-; CHECK-i64-NEXT: str q1, [sp, #96] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #304]
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: stp q3, q2, [sp, #192] // 32-byte Folded Spill
-; CHECK-i64-NEXT: str q1, [sp, #80] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #320]
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: str q1, [sp, #64] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #336]
-; CHECK-i64-NEXT: addvl x8, sp, #4
+; CHECK-i64-NEXT: sub sp, sp, #368
+; CHECK-i64-NEXT: stp q3, q1, [sp, #240] // 32-byte Folded Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #464]
+; CHECK-i64-NEXT: stp x29, x30, [sp, #272] // 16-byte Folded Spill
+; CHECK-i64-NEXT: stp q2, q4, [sp, #16] // 32-byte Folded Spill
+; CHECK-i64-NEXT: stp q1, q5, [sp, #192] // 32-byte Folded Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #480]
+; CHECK-i64-NEXT: stp x28, x27, [sp, #288] // 16-byte Folded Spill
+; CHECK-i64-NEXT: str q1, [sp, #176] // 16-byte Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #432]
+; CHECK-i64-NEXT: stp x26, x25, [sp, #304] // 16-byte Folded Spill
+; CHECK-i64-NEXT: str q1, [sp, #160] // 16-byte Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #448]
+; CHECK-i64-NEXT: stp x24, x23, [sp, #320] // 16-byte Folded Spill
+; CHECK-i64-NEXT: str q1, [sp, #144] // 16-byte Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #400]
+; CHECK-i64-NEXT: stp x22, x21, [sp, #336] // 16-byte Folded Spill
+; CHECK-i64-NEXT: str q1, [sp, #128] // 16-byte Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #416]
+; CHECK-i64-NEXT: stp x20, x19, [sp, #352] // 16-byte Folded Spill
+; CHECK-i64-NEXT: stp q6, q1, [sp, #96] // 32-byte Folded Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #368]
+; CHECK-i64-NEXT: stp q7, q1, [sp, #64] // 32-byte Folded Spill
+; CHECK-i64-NEXT: ldr q1, [sp, #384]
; CHECK-i64-NEXT: str q1, [sp, #48] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #352]
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: str q1, [sp, #32] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #368]
-; CHECK-i64-NEXT: addvl x8, sp, #4
-; CHECK-i64-NEXT: str q1, [sp, #16] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q1, [x8, #384]
-; CHECK-i64-NEXT: mov v0.16b, v1.16b
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp] // 16-byte Spill
+; CHECK-i64-NEXT: ldr q0, [sp, #240] // 16-byte Reload
+; CHECK-i64-NEXT: str x0, [sp, #224] // 8-byte Spill
+; CHECK-i64-NEXT: bl lrintl
; CHECK-i64-NEXT: ldr q0, [sp, #16] // 16-byte Reload
+; CHECK-i64-NEXT: str x0, [sp, #240] // 8-byte Spill
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-i64-NEXT: ldr q0, [sp, #208] // 16-byte Reload
+; CHECK-i64-NEXT: mov x22, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #32] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #48] // 16-byte Reload
+; CHECK-i64-NEXT: ldr q0, [sp, #32] // 16-byte Reload
+; CHECK-i64-NEXT: str x0, [sp, #208] // 8-byte Spill
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #32] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8, #3, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: str z0, [x8, #3, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #64] // 16-byte Reload
+; CHECK-i64-NEXT: mov x24, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #64] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #80] // 16-byte Reload
-; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #64] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #96] // 16-byte Reload
+; CHECK-i64-NEXT: mov x23, x0
+; CHECK-i64-NEXT: bl lrintl
+; CHECK-i64-NEXT: ldr q0, [sp, #48] // 16-byte Reload
+; CHECK-i64-NEXT: mov x25, x0
+; CHECK-i64-NEXT: bl lrintl
+; CHECK-i64-NEXT: ldr q0, [sp, #80] // 16-byte Reload
+; CHECK-i64-NEXT: mov x26, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #96] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #112] // 16-byte Reload
+; CHECK-i64-NEXT: mov x27, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #96] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8, #2, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: str z0, [x8, #2, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #128] // 16-byte Reload
+; CHECK-i64-NEXT: mov x28, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #128] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #144] // 16-byte Reload
+; CHECK-i64-NEXT: mov x29, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #128] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #160] // 16-byte Reload
+; CHECK-i64-NEXT: mov x19, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #160] // 16-byte Spill
; CHECK-i64-NEXT: ldr q0, [sp, #176] // 16-byte Reload
+; CHECK-i64-NEXT: mov x20, x0
; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #160] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: str z0, [x8, #1, mul vl] // 16-byte Folded Spill
; CHECK-i64-NEXT: ldr q0, [sp, #192] // 16-byte Reload
+; CHECK-i64-NEXT: mov x21, x0
; CHECK-i64-NEXT: bl lrintl
; CHECK-i64-NEXT: fmov d0, x0
+; CHECK-i64-NEXT: ldr x8, [sp, #224] // 8-byte Reload
+; CHECK-i64-NEXT: fmov d1, x23
; CHECK-i64-NEXT: str q0, [sp, #192] // 16-byte Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #208] // 16-byte Reload
-; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #192] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: str z0, [x8] // 16-byte Folded Spill
-; CHECK-i64-NEXT: ldr q0, [sp, #240] // 16-byte Reload
+; CHECK-i64-NEXT: fmov d0, x21
+; CHECK-i64-NEXT: str q0, [sp, #176] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x20
+; CHECK-i64-NEXT: str q0, [sp, #160] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x19
+; CHECK-i64-NEXT: str q0, [sp, #144] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x29
+; CHECK-i64-NEXT: str q0, [sp, #128] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x27
+; CHECK-i64-NEXT: str q0, [sp, #112] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x25
+; CHECK-i64-NEXT: str q0, [sp, #96] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x24
+; CHECK-i64-NEXT: str q0, [sp, #48] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x22
+; CHECK-i64-NEXT: str q0, [sp, #32] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x8
+; CHECK-i64-NEXT: str q0, [sp, #80] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x28
+; CHECK-i64-NEXT: str q0, [sp, #224] // 16-byte Spill
+; CHECK-i64-NEXT: fmov d0, x26
+; CHECK-i64-NEXT: str q0, [sp, #64] // 16-byte Spill
+; CHECK-i64-NEXT: ldr q0, [sp, #256] // 16-byte Reload
+; CHECK-i64-NEXT: str q1, [sp, #256] // 16-byte Spill
+; CHECK-i64-NEXT: ldr d1, [sp, #208] // 8-byte Reload
+; CHECK-i64-NEXT: str q1, [sp, #208] // 16-byte Spill
+; CHECK-i64-NEXT: ldr d1, [sp, #240] // 8-byte Reload
+; CHECK-i64-NEXT: str q1, [sp, #240] // 16-byte Spill
; CHECK-i64-NEXT: bl lrintl
+; CHECK-i64-NEXT: ldp q1, q2, [sp, #32] // 32-byte Folded Reload
+; CHECK-i64-NEXT: ldr q5, [sp, #64] // 16-byte Reload
+; CHECK-i64-NEXT: ldp q0, q4, [sp, #240] // 32-byte Folded Reload
+; CHECK-i64-NEXT: ldp q16, q3, [sp, #80] // 32-byte Folded Reload
+; CHECK-i64-NEXT: ldp x20, x19, [sp, #352] // 16-byte Folded Reload
+; CHECK-i64-NEXT: mov v1.d[1], v0.d[0]
+; CHECK-i64-NEXT: ldp q7, q0, [sp, #192] // 32-byte Folded Reload
+; CHECK-i64-NEXT: mov v3.d[1], v4.d[0]
+; CHECK-i64-NEXT: ldr q4, [sp, #112] // 16-byte Reload
+; CHECK-i64-NEXT: ldp x22, x21, [sp, #336] // 16-byte Folded Reload
+; CHECK-i64-NEXT: mov v2.d[1], v0.d[0]
; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: str q0, [sp, #240] // 16-byte Spill
+; CHECK-i64-NEXT: mov v4.d[1], v5.d[0]
+; CHECK-i64-NEXT: ldr q5, [sp, #128] // 16-byte Reload
+; CHECK-i64-NEXT: ldp x24, x23, [sp, #320] // 16-byte Folded Reload
+; CHECK-i64-NEXT: ldp x26, x25, [sp, #304] // 16-byte Folded Reload
+; CHECK-i64-NEXT: mov v16.d[1], v0.d[0]
; CHECK-i64-NEXT: ldr q0, [sp, #224] // 16-byte Reload
-; CHECK-i64-NEXT: bl lrintl
-; CHECK-i64-NEXT: fmov d0, x0
-; CHECK-i64-NEXT: ldr q1, [sp, #240] // 16-byte Reload
-; CHECK-i64-NEXT: add x8, sp, #256
-; CHECK-i64-NEXT: ptrue p0.d, vl2
-; CHECK-i64-NEXT: ldr z2, [x8, #1, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: ldr z4, [x8, #2, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: ldr z6, [x8, #3, mul vl] // 16-byte Folded Reload
-; CHECK-i64-NEXT: mov v0.d[1], v1.d[0]
-; CHECK-i64-NEXT: ldr z1, [x8] // 16-byte Folded Reload
-; CHECK-i64-NEXT: movprfx z3, z2
-; CHECK-i64-NEXT: ext z3.b, z3.b, z2.b, #16
-; CHECK-i64-NEXT: movprfx z5, z4
-; CHECK-i64-NEXT: ext z5.b, z5.b, z4.b, #16
-; CHECK-i64-NEXT: // kill: def $q2 killed $q2 killed $z2
-; CHECK-i64-NEXT: // kill: def $q4 killed $q4 killed $z4
-; CHECK-i64-NEXT: movprfx z7, z6
-; CHECK-i64-NEXT: ext z7.b, z7.b, z6.b, #16
-; CHECK-i64-NEXT: // kill: def $q6 killed $q6 killed $z6
-; CHECK-i64-NEXT: splice z0.d, p0, z0.d, z1.d
-; CHECK-i64-NEXT: // kill: def $q3 killed $q3 killed $z3
-; CHECK-i64-NEXT: // kill: def $q5 killed $q5 killed $z5
-; CHECK-i64-NEXT: // kill: def $q7 killed $q7 killed $z7
-; CHECK-i64-NEXT: movprfx z1, z0
-; CHECK-i64-NEXT: ext z1.b, z1.b, z0.b, #16
-; CHECK-i64-NEXT: // kill: def $q0 killed $q0 killed $z0
-; CHECK-i64-NEXT: // kill: def $q1 killed $q1 killed $z1
-; CHECK-i64-NEXT: addvl sp, sp, #4
-; CHECK-i64-NEXT: add sp, sp, #256
-; CHECK-i64-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-i64-NEXT: ldp x28, x27, [sp, #288] // 16-byte Folded Reload
+; CHECK-i64-NEXT: mov v5.d[1], v0.d[0]
+; CHECK-i64-NEXT: ldp q0, q6, [sp, #144] // 32-byte Folded Reload
+; CHECK-i64-NEXT: ldp x29, x30, [sp, #272] // 16-byte Folded Reload
+; CHECK-i64-NEXT: mov v6.d[1], v0.d[0]
+; CHECK-i64-NEXT: ldr q0, [sp, #176] // 16-byte Reload
+; CHECK-i64-NEXT: mov v7.d[1], v0.d[0]
+; CHECK-i64-NEXT: mov v0.16b, v16.16b
+; CHECK-i64-NEXT: add sp, sp, #368
; CHECK-i64-NEXT: ret
%a = call <16 x iXLen> @llvm.lrint.v16iXLen.v16fp128(<16 x fp128> %x)
ret <16 x iXLen> %a
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 15b98bb8acc90..2ac5574be0def 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -44933,160 +44933,118 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_and_b32_e32 v0, 1, v20
; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v21
+; GFX8-NEXT: buffer_load_ushort v6, off, s[0:3], s32
; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
; GFX8-NEXT: v_and_b32_e32 v0, 1, v22
+; GFX8-NEXT: v_and_b32_e32 v7, 1, v23
+; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64
+; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128
+; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60
+; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:124
; GFX8-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v23
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v24
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0
-; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0
-; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX8-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX8-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX8-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX8-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX8-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX8-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX8-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v7
+; GFX8-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52
+; GFX8-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116
+; GFX8-NEXT: v_and_b32_e32 v9, 1, v24
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v9
+; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48
+; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v25
; GFX8-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116
-; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
-; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:120
-; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:56
-; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:124
-; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:60
-; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:128
-; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64
-; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v25
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v24, 16, v26
-; GFX8-NEXT: v_cndmask_b32_e64 v24, v33, v24, s[38:39]
-; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v26, s[36:37]
-; GFX8-NEXT: v_lshrrev_b32_e32 v26, 16, v28
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v27
-; GFX8-NEXT: v_cndmask_b32_e64 v26, v33, v26, s[34:35]
-; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v28, s[30:31]
-; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v30
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29
-; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[90:91]
-; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v30, s[88:89]
-; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v32
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v31
-; GFX8-NEXT: v_cndmask_b32_e64 v30, v33, v30, s[78:79]
-; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[76:77]
-; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v23
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22
-; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[74:75]
-; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
-; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20
-; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[62:63]
-; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
-; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18
-; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[58:59]
-; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
-; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16
-; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[46:47]
-; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
-; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14
-; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[42:43]
-; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
-; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12
-; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29]
-; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
-; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10
-; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25]
-; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
-; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8
-; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21]
-; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
-; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6
-; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17]
-; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
-; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4
-; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13]
-; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
+; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v26
+; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:40
+; GFX8-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v27
+; GFX8-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
+; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v28
+; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
+; GFX8-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v29
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v11
+; GFX8-NEXT: v_and_b32_e32 v11, 1, v30
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v11
+; GFX8-NEXT: s_waitcnt vmcnt(14)
+; GFX8-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v6
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5
+; GFX8-NEXT: v_cndmask_b32_e64 v16, v5, v4, s[36:37]
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2
-; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v17, v3, v2, s[30:31]
+; GFX8-NEXT: s_waitcnt vmcnt(13)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(12)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3
-; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v5
-; GFX8-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v9
-; GFX8-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v11
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v13
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v15
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v17
-; GFX8-NEXT: v_or_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v7, v14, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v19
-; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v21
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v23
-; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v32
-; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v30
-; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v28
-; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v26
-; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; GFX8-NEXT: v_cndmask_b32_e64 v18, v1, v0, s[88:89]
+; GFX8-NEXT: s_waitcnt vmcnt(11)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX8-NEXT: s_waitcnt vmcnt(10)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX8-NEXT: v_cndmask_b32_e64 v12, v1, v0, s[78:79]
+; GFX8-NEXT: s_waitcnt vmcnt(9)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v9
+; GFX8-NEXT: s_waitcnt vmcnt(8)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v10
+; GFX8-NEXT: v_cndmask_b32_e64 v15, v11, v6, s[38:39]
+; GFX8-NEXT: v_cndmask_b32_e64 v11, v1, v0, s[74:75]
+; GFX8-NEXT: s_waitcnt vmcnt(7)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v21
+; GFX8-NEXT: s_waitcnt vmcnt(6)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v22
+; GFX8-NEXT: v_cndmask_b32_e64 v20, v10, v9, s[72:73]
+; GFX8-NEXT: v_cndmask_b32_e64 v10, v1, v0, s[62:63]
+; GFX8-NEXT: s_waitcnt vmcnt(5)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v23
+; GFX8-NEXT: s_waitcnt vmcnt(4)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v24
+; GFX8-NEXT: v_cndmask_b32_e64 v9, v1, v0, s[58:59]
+; GFX8-NEXT: s_waitcnt vmcnt(3)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v25
+; GFX8-NEXT: s_waitcnt vmcnt(2)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v26
+; GFX8-NEXT: v_cndmask_b32_e64 v19, v8, v7, s[76:77]
+; GFX8-NEXT: v_cndmask_b32_e64 v8, v1, v0, s[46:47]
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v27
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v28
+; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, v0, s[42:43]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
+; GFX8-NEXT: v_cndmask_b32_e64 v13, v3, v2, s[90:91]
+; GFX8-NEXT: v_cndmask_b32_e64 v21, v22, v21, s[60:61]
+; GFX8-NEXT: v_cndmask_b32_e64 v22, v24, v23, s[56:57]
+; GFX8-NEXT: v_cndmask_b32_e64 v23, v26, v25, s[44:45]
+; GFX8-NEXT: v_cndmask_b32_e64 v14, v5, v4, s[34:35]
+; GFX8-NEXT: v_cndmask_b32_e64 v24, v28, v27, s[40:41]
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v8
+; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v10
+; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v12
+; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v13
+; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v14
+; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GFX8-NEXT: v_readlane_b32 s30, v34, 6
-; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v10, v20, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v11, v22, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v12, v31, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v13, v29, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v14, v27, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_sdwa v15, v25, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v7, v24, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v8, v23, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v9, v22, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v10, v21, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v11, v20, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v12, v19, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v13, v18, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v14, v17, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_sdwa v15, v16, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: v_readlane_b32 s31, v34, 7
; GFX8-NEXT: v_readlane_b32 s39, v34, 5
; GFX8-NEXT: v_readlane_b32 s38, v34, 4
@@ -45094,6 +45052,74 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX8-NEXT: v_readlane_b32 s36, v34, 2
; GFX8-NEXT: v_readlane_b32 s35, v34, 1
; GFX8-NEXT: v_readlane_b32 s34, v34, 0
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v25, v1, v0, s[26:27]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
+; GFX8-NEXT: v_cndmask_b32_e64 v6, v3, v2, s[28:29]
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_or_b32_sdwa v6, v25, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v26, v1, v0, s[22:23]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84
+; GFX8-NEXT: v_cndmask_b32_e64 v5, v3, v2, s[24:25]
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_or_b32_sdwa v5, v26, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v27, v1, v0, s[18:19]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80
+; GFX8-NEXT: v_cndmask_b32_e64 v4, v3, v2, s[20:21]
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX8-NEXT: v_or_b32_sdwa v4, v27, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v28, v1, v0, s[14:15]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v2, s[16:17]
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX8-NEXT: v_or_b32_sdwa v3, v28, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v29, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v29, v2, s[12:13]
+; GFX8-NEXT: v_cndmask_b32_e64 v29, v1, v0, s[10:11]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: v_or_b32_sdwa v2, v29, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v30, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v1
+; GFX8-NEXT: v_cndmask_b32_e64 v30, v31, v30, s[8:9]
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v0, s[6:7]
+; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v30
+; GFX8-NEXT: v_or_b32_sdwa v1, v1, v30 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_waitcnt vmcnt(1)
+; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v31
+; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v31, v0, vcc
+; GFX8-NEXT: v_lshlrev_b32_e32 v31, 16, v32
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v31 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX8-NEXT: s_mov_b64 exec, s[4:5]
@@ -45130,170 +45156,193 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX900-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0
; GFX900-NEXT: v_and_b32_e32 v0, 1, v11
; GFX900-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v10
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v13
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v12
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v15
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v14
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v17
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v16
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v19
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v18
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v21
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v20
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v23
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v22
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v25
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v24
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v27
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v26
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v29
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v28
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v0
; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], s32
; GFX900-NEXT: v_and_b32_e32 v1, 1, v1
; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v10
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v13
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v12
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v15
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v14
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v17
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v16
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v19
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v18
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v21
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v1
+; GFX900-NEXT: v_and_b32_e32 v1, 1, v20
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v1
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64
+; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128
+; GFX900-NEXT: v_and_b32_e32 v3, 1, v23
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[72:73], 1, v3
+; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:60
+; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124
+; GFX900-NEXT: v_and_b32_e32 v5, 1, v22
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[74:75], 1, v5
+; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56
+; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120
+; GFX900-NEXT: v_and_b32_e32 v7, 1, v25
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[76:77], 1, v7
+; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:52
+; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:116
+; GFX900-NEXT: v_and_b32_e32 v9, 1, v24
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[78:79], 1, v9
+; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:48
+; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:112
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v27
+; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
+; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:108
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[88:89], 1, v11
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v26
+; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:40
+; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:104
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[90:91], 1, v11
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v29
+; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36
+; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[92:93], 1, v11
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v28
+; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
+; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:96
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[94:95], 1, v11
+; GFX900-NEXT: v_and_b32_e32 v11, 1, v30
+; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v11
+; GFX900-NEXT: s_waitcnt vmcnt(18)
; GFX900-NEXT: v_and_b32_e32 v0, 1, v0
; GFX900-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0
-; GFX900-NEXT: v_and_b32_e32 v0, 1, v30
-; GFX900-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0
-; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX900-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX900-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX900-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX900-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX900-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX900-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX900-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX900-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX900-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX900-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX900-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX900-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX900-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX900-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX900-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX900-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX900-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX900-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX900-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX900-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX900-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX900-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX900-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
-; GFX900-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
-; GFX900-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GFX900-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
-; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
-; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128
-; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
-; GFX900-NEXT: s_waitcnt vmcnt(0)
-; GFX900-NEXT: v_cndmask_b32_e64 v30, v31, v32, s[34:35]
-; GFX900-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX900-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[30:31]
-; GFX900-NEXT: v_cndmask_b32_e64 v32, v28, v29, s[94:95]
-; GFX900-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX900-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX900-NEXT: v_cndmask_b32_e64 v28, v28, v29, s[92:93]
-; GFX900-NEXT: v_cndmask_b32_e64 v29, v26, v27, s[90:91]
-; GFX900-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX900-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX900-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[88:89]
-; GFX900-NEXT: v_cndmask_b32_e64 v27, v24, v25, s[78:79]
-; GFX900-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX900-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX900-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[76:77]
-; GFX900-NEXT: v_cndmask_b32_e64 v25, v22, v23, s[74:75]
-; GFX900-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX900-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX900-NEXT: v_cndmask_b32_e64 v22, v22, v23, s[72:73]
-; GFX900-NEXT: v_cndmask_b32_e64 v23, v20, v21, s[62:63]
-; GFX900-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX900-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX900-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[60:61]
-; GFX900-NEXT: v_cndmask_b32_e64 v21, v18, v19, s[58:59]
-; GFX900-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX900-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX900-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[56:57]
-; GFX900-NEXT: v_cndmask_b32_e64 v19, v16, v17, s[46:47]
-; GFX900-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX900-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX900-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[44:45]
-; GFX900-NEXT: v_cndmask_b32_e64 v17, v14, v15, s[42:43]
-; GFX900-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX900-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX900-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[40:41]
-; GFX900-NEXT: v_cndmask_b32_e64 v15, v12, v13, s[28:29]
-; GFX900-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX900-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX900-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27]
-; GFX900-NEXT: v_cndmask_b32_e64 v13, v10, v11, s[24:25]
-; GFX900-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX900-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX900-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23]
-; GFX900-NEXT: v_cndmask_b32_e64 v11, v8, v9, s[20:21]
-; GFX900-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX900-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX900-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19]
-; GFX900-NEXT: v_cndmask_b32_e64 v9, v6, v7, s[16:17]
-; GFX900-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX900-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX900-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15]
-; GFX900-NEXT: v_cndmask_b32_e64 v7, v4, v5, s[12:13]
-; GFX900-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX900-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX900-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11]
-; GFX900-NEXT: v_cndmask_b32_e64 v5, v2, v3, s[8:9]
-; GFX900-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX900-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7]
-; GFX900-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; GFX900-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92
+; GFX900-NEXT: s_waitcnt vmcnt(18)
+; GFX900-NEXT: v_cndmask_b32_e64 v15, v2, v1, s[34:35]
; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX900-NEXT: v_cndmask_b32_e64 v16, v2, v1, s[30:31]
+; GFX900-NEXT: s_waitcnt vmcnt(17)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX900-NEXT: s_waitcnt vmcnt(16)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v4
+; GFX900-NEXT: v_cndmask_b32_e64 v17, v2, v1, s[92:93]
+; GFX900-NEXT: s_waitcnt vmcnt(15)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v5
+; GFX900-NEXT: s_waitcnt vmcnt(14)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v6
+; GFX900-NEXT: v_cndmask_b32_e64 v18, v2, v1, s[88:89]
+; GFX900-NEXT: s_waitcnt vmcnt(13)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX900-NEXT: s_waitcnt vmcnt(12)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v8
+; GFX900-NEXT: v_cndmask_b32_e64 v19, v2, v1, s[76:77]
+; GFX900-NEXT: s_waitcnt vmcnt(11)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v9
+; GFX900-NEXT: s_waitcnt vmcnt(10)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v10
+; GFX900-NEXT: v_cndmask_b32_e64 v20, v2, v1, s[72:73]
+; GFX900-NEXT: s_waitcnt vmcnt(9)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v21
+; GFX900-NEXT: s_waitcnt vmcnt(8)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v22
+; GFX900-NEXT: v_cndmask_b32_e64 v11, v10, v9, s[74:75]
+; GFX900-NEXT: v_cndmask_b32_e64 v10, v22, v21, s[62:63]
+; GFX900-NEXT: v_cndmask_b32_e64 v21, v2, v1, s[60:61]
+; GFX900-NEXT: s_waitcnt vmcnt(7)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v23
+; GFX900-NEXT: s_waitcnt vmcnt(6)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v24
+; GFX900-NEXT: v_cndmask_b32_e64 v22, v2, v1, s[56:57]
+; GFX900-NEXT: s_waitcnt vmcnt(5)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v25
+; GFX900-NEXT: s_waitcnt vmcnt(4)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v26
+; GFX900-NEXT: v_cndmask_b32_e64 v9, v24, v23, s[58:59]
+; GFX900-NEXT: v_cndmask_b32_e64 v23, v2, v1, s[44:45]
+; GFX900-NEXT: s_waitcnt vmcnt(3)
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v27
+; GFX900-NEXT: s_waitcnt vmcnt(2)
+; GFX900-NEXT: v_lshrrev_b32_e32 v2, 16, v28
+; GFX900-NEXT: v_cndmask_b32_e64 v13, v6, v5, s[90:91]
+; GFX900-NEXT: v_cndmask_b32_e64 v24, v2, v1, s[40:41]
+; GFX900-NEXT: v_cndmask_b32_e64 v12, v8, v7, s[78:79]
+; GFX900-NEXT: v_cndmask_b32_e64 v8, v26, v25, s[46:47]
+; GFX900-NEXT: v_cndmask_b32_e64 v14, v4, v3, s[94:95]
+; GFX900-NEXT: v_cndmask_b32_e64 v7, v28, v27, s[42:43]
; GFX900-NEXT: v_readlane_b32 s30, v33, 2
-; GFX900-NEXT: v_perm_b32 v0, v0, v3, s4
-; GFX900-NEXT: v_perm_b32 v1, v2, v5, s4
-; GFX900-NEXT: v_perm_b32 v2, v4, v7, s4
-; GFX900-NEXT: v_perm_b32 v3, v6, v9, s4
-; GFX900-NEXT: v_perm_b32 v4, v8, v11, s4
-; GFX900-NEXT: v_perm_b32 v5, v10, v13, s4
-; GFX900-NEXT: v_perm_b32 v6, v12, v15, s4
-; GFX900-NEXT: v_perm_b32 v7, v14, v17, s4
-; GFX900-NEXT: v_perm_b32 v8, v16, v19, s4
-; GFX900-NEXT: v_perm_b32 v9, v18, v21, s4
-; GFX900-NEXT: v_perm_b32 v10, v20, v23, s4
-; GFX900-NEXT: v_perm_b32 v11, v22, v25, s4
-; GFX900-NEXT: v_perm_b32 v12, v24, v27, s4
-; GFX900-NEXT: v_perm_b32 v13, v26, v29, s4
-; GFX900-NEXT: v_perm_b32 v14, v28, v32, s4
-; GFX900-NEXT: v_perm_b32 v15, v31, v30, s4
; GFX900-NEXT: v_readlane_b32 s31, v33, 3
; GFX900-NEXT: v_readlane_b32 s35, v33, 1
; GFX900-NEXT: v_readlane_b32 s34, v33, 0
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v6, v29, v0, s[28:29]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v29
+; GFX900-NEXT: v_cndmask_b32_e64 v25, v1, v0, s[26:27]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v5, v1, v0, s[24:25]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v26, v1, v0, s[22:23]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v4, v1, v0, s[20:21]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v27, v1, v0, s[18:19]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v3, v1, v0, s[16:17]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v28, v1, v0, s[14:15]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v2, v1, v0, s[12:13]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX900-NEXT: v_cndmask_b32_e64 v29, v1, v0, s[10:11]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; GFX900-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:72
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v1, v30, v0, s[8:9]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v30, 16, v30
+; GFX900-NEXT: v_cndmask_b32_e64 v30, v30, v0, s[6:7]
+; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_cndmask_b32_e64 v32, v31, v0, s[4:5]
+; GFX900-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX900-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX900-NEXT: v_cndmask_b32_e32 v0, v31, v0, vcc
+; GFX900-NEXT: s_mov_b32 s4, 0x5040100
+; GFX900-NEXT: v_perm_b32 v0, v0, v32, s4
+; GFX900-NEXT: v_perm_b32 v1, v30, v1, s4
+; GFX900-NEXT: v_perm_b32 v2, v29, v2, s4
+; GFX900-NEXT: v_perm_b32 v3, v28, v3, s4
+; GFX900-NEXT: v_perm_b32 v4, v27, v4, s4
+; GFX900-NEXT: v_perm_b32 v5, v26, v5, s4
+; GFX900-NEXT: v_perm_b32 v6, v25, v6, s4
+; GFX900-NEXT: v_perm_b32 v7, v24, v7, s4
+; GFX900-NEXT: v_perm_b32 v8, v23, v8, s4
+; GFX900-NEXT: v_perm_b32 v9, v22, v9, s4
+; GFX900-NEXT: v_perm_b32 v10, v21, v10, s4
+; GFX900-NEXT: v_perm_b32 v11, v20, v11, s4
+; GFX900-NEXT: v_perm_b32 v12, v19, v12, s4
+; GFX900-NEXT: v_perm_b32 v13, v18, v13, s4
+; GFX900-NEXT: v_perm_b32 v14, v17, v14, s4
+; GFX900-NEXT: v_perm_b32 v15, v16, v15, s4
; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1
; GFX900-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
; GFX900-NEXT: s_mov_b64 exec, s[4:5]
@@ -45315,35 +45364,40 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX950-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse
-; GFX950-NEXT: scratch_load_dword v31, off, s32 offset:60
-; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:124
-; GFX950-NEXT: scratch_load_ushort v33, off, s32
-; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:64
-; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:128
-; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:120
-; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:56
-; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:116
-; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:52
-; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:112
-; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:48
-; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:88
-; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:24
-; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:92
-; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:28
-; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:108
-; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:44
-; GFX950-NEXT: scratch_load_dword v40, off, s32 offset:96
+; GFX950-NEXT: scratch_load_ushort v31, off, s32
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:64
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:128
+; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:60
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:124
+; GFX950-NEXT: scratch_load_dword v36, off, s32 offset:56
+; GFX950-NEXT: scratch_load_dword v37, off, s32 offset:120
+; GFX950-NEXT: scratch_load_dword v38, off, s32 offset:52
+; GFX950-NEXT: scratch_load_dword v39, off, s32 offset:116
+; GFX950-NEXT: scratch_load_dword v48, off, s32 offset:48
+; GFX950-NEXT: scratch_load_dword v49, off, s32 offset:112
+; GFX950-NEXT: scratch_load_dword v50, off, s32 offset:44
+; GFX950-NEXT: scratch_load_dword v51, off, s32 offset:108
+; GFX950-NEXT: scratch_load_dword v52, off, s32 offset:40
+; GFX950-NEXT: scratch_load_dword v53, off, s32 offset:104
+; GFX950-NEXT: scratch_load_dword v54, off, s32 offset:36
+; GFX950-NEXT: scratch_load_dword v55, off, s32 offset:100
; GFX950-NEXT: scratch_load_dword v41, off, s32 offset:32
-; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:100
-; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:36
-; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:104
-; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:40
-; GFX950-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29
-; GFX950-NEXT: scratch_load_dword v29, off, s32 offset:84
-; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:20
+; GFX950-NEXT: scratch_load_dword v42, off, s32 offset:96
+; GFX950-NEXT: scratch_load_dword v43, off, s32 offset:28
+; GFX950-NEXT: scratch_load_dword v44, off, s32 offset:92
+; GFX950-NEXT: scratch_load_dword v45, off, s32 offset:24
+; GFX950-NEXT: scratch_load_dword v46, off, s32 offset:88
+; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:20
+; GFX950-NEXT: scratch_load_dword v56, off, s32 offset:84
+; GFX950-NEXT: v_and_b32_e32 v40, 1, v15
+; GFX950-NEXT: v_and_b32_e32 v15, 1, v30
+; GFX950-NEXT: scratch_load_dword v30, off, s32 offset:16
+; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:80
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX950-NEXT: scratch_load_dword v58, off, s32 offset:12
+; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:76
; GFX950-NEXT: v_and_b32_e32 v28, 1, v28
-; GFX950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v28
+; GFX950-NEXT: v_and_b32_e32 v29, 1, v29
; GFX950-NEXT: v_and_b32_e32 v26, 1, v26
; GFX950-NEXT: v_and_b32_e32 v27, 1, v27
; GFX950-NEXT: v_and_b32_e32 v24, 1, v24
@@ -45357,7 +45411,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX950-NEXT: v_and_b32_e32 v16, 1, v16
; GFX950-NEXT: v_and_b32_e32 v17, 1, v17
; GFX950-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX950-NEXT: v_and_b32_e32 v15, 1, v15
; GFX950-NEXT: v_and_b32_e32 v12, 1, v12
; GFX950-NEXT: v_and_b32_e32 v13, 1, v13
; GFX950-NEXT: v_and_b32_e32 v10, 1, v10
@@ -45372,147 +45425,149 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX950-NEXT: v_and_b32_e32 v3, 1, v3
; GFX950-NEXT: v_and_b32_e32 v0, 1, v0
; GFX950-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX950-NEXT: s_waitcnt vmcnt(24)
-; GFX950-NEXT: v_lshrrev_b32_e32 v46, 16, v31
-; GFX950-NEXT: s_waitcnt vmcnt(23)
-; GFX950-NEXT: v_lshrrev_b32_e32 v47, 16, v32
-; GFX950-NEXT: s_waitcnt vmcnt(22)
-; GFX950-NEXT: v_and_b32_e32 v28, 1, v33
-; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:80
-; GFX950-NEXT: scratch_load_dword v57, off, s32 offset:16
-; GFX950-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v28
-; GFX950-NEXT: v_and_b32_e32 v28, 1, v30
-; GFX950-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v28
-; GFX950-NEXT: scratch_load_dword v28, off, s32 offset:76
-; GFX950-NEXT: scratch_load_dword v30, off, s32 offset:12
-; GFX950-NEXT: s_waitcnt vmcnt(25)
-; GFX950-NEXT: v_lshrrev_b32_e32 v58, 16, v34
-; GFX950-NEXT: s_waitcnt vmcnt(24)
-; GFX950-NEXT: v_lshrrev_b32_e32 v59, 16, v35
-; GFX950-NEXT: v_cndmask_b32_e64 v34, v35, v34, s[4:5]
-; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:72
-; GFX950-NEXT: v_cndmask_b32_e64 v58, v59, v58, s[2:3]
-; GFX950-NEXT: scratch_load_dword v59, off, s32 offset:8
-; GFX950-NEXT: v_cndmask_b32_e64 v31, v32, v31, s[0:1]
-; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:68
-; GFX950-NEXT: v_cndmask_b32_e32 v46, v47, v46, vcc
-; GFX950-NEXT: scratch_load_dword v47, off, s32 offset:4
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
; GFX950-NEXT: s_mov_b32 s0, 0x5040100
+; GFX950-NEXT: s_waitcnt vmcnt(28)
+; GFX950-NEXT: v_and_b32_e32 v31, 1, v31
; GFX950-NEXT: s_waitcnt vmcnt(26)
-; GFX950-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX950-NEXT: v_cndmask_b32_e32 v15, v33, v32, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v32
+; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v31
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v31, v33, v32, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v28
+; GFX950-NEXT: scratch_load_dword v32, off, s32 offset:8
+; GFX950-NEXT: scratch_load_dword v33, off, s32 offset:72
+; GFX950-NEXT: s_waitcnt vmcnt(26)
+; GFX950-NEXT: v_cndmask_b32_e32 v28, v35, v34, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v34, 16, v34
+; GFX950-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v29
+; GFX950-NEXT: v_perm_b32 v15, v31, v15, s0
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v29, v35, v34, vcc
+; GFX950-NEXT: scratch_load_dword v34, off, s32 offset:4
+; GFX950-NEXT: scratch_load_dword v35, off, s32 offset:68
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v26
+; GFX950-NEXT: s_waitcnt vmcnt(26)
+; GFX950-NEXT: s_nop 0
+; GFX950-NEXT: v_cndmask_b32_e32 v26, v37, v36, vcc
; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v36
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v37
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v27
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v27, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24
+; GFX950-NEXT: s_waitcnt vmcnt(25)
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v38
; GFX950-NEXT: s_waitcnt vmcnt(24)
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v39
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v38
-; GFX950-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v39
+; GFX950-NEXT: v_cndmask_b32_e32 v24, v39, v38, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v25
-; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: s_waitcnt vmcnt(12)
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v44
; GFX950-NEXT: v_cndmask_b32_e32 v25, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v22
-; GFX950-NEXT: s_waitcnt vmcnt(22)
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v49
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v48
-; GFX950-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v48
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v49
+; GFX950-NEXT: v_cndmask_b32_e32 v22, v49, v48, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v23
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v23, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20
-; GFX950-NEXT: s_waitcnt vmcnt(16)
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v55
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v54
-; GFX950-NEXT: v_cndmask_b32_e32 v20, v54, v55, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v50
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v51
+; GFX950-NEXT: v_cndmask_b32_e32 v20, v51, v50, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v21
; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v21, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18
-; GFX950-NEXT: s_waitcnt vmcnt(10)
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v45
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v44
-; GFX950-NEXT: v_cndmask_b32_e32 v18, v44, v45, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v52
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v53
+; GFX950-NEXT: v_cndmask_b32_e32 v18, v53, v52, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19
-; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v19, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v43
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42
-; GFX950-NEXT: v_cndmask_b32_e32 v16, v42, v43, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v54
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v55
+; GFX950-NEXT: v_cndmask_b32_e32 v16, v55, v54, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
-; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
+; GFX950-NEXT: s_nop 1
; GFX950-NEXT: v_cndmask_b32_e32 v17, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14
; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v41
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v40
-; GFX950-NEXT: v_cndmask_b32_e32 v14, v40, v41, vcc
-; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v42
+; GFX950-NEXT: v_cndmask_b32_e32 v14, v42, v41, vcc
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v40
+; GFX950-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse
; GFX950-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
-; GFX950-NEXT: v_cndmask_b32_e32 v15, v37, v36, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v36, v37, v36, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v53
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v52
-; GFX950-NEXT: v_cndmask_b32_e32 v12, v52, v53, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v43
+; GFX950-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v12, v44, v43, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v13, v37, v36, vcc
+; GFX950-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v13, v38, v37, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v51
-; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v50
-; GFX950-NEXT: v_cndmask_b32_e32 v10, v50, v51, vcc
+; GFX950-NEXT: s_waitcnt vmcnt(11)
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v45
+; GFX950-NEXT: s_waitcnt vmcnt(10)
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v46
+; GFX950-NEXT: v_cndmask_b32_e32 v10, v46, v45, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v11, v37, v36, vcc
+; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v11, v38, v37, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX950-NEXT: s_waitcnt vmcnt(9)
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v47
; GFX950-NEXT: s_waitcnt vmcnt(8)
-; GFX950-NEXT: v_lshrrev_b32_e32 v36, 16, v56
-; GFX950-NEXT: v_cndmask_b32_e32 v8, v29, v56, vcc
-; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v29
+; GFX950-NEXT: v_lshrrev_b32_e32 v38, 16, v56
+; GFX950-NEXT: v_cndmask_b32_e32 v8, v56, v47, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9
; GFX950-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v9, v29, v36, vcc
+; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v9, v38, v37, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
; GFX950-NEXT: s_waitcnt vmcnt(6)
-; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v57
-; GFX950-NEXT: v_cndmask_b32_e32 v6, v33, v57, vcc
-; GFX950-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v57
+; GFX950-NEXT: v_cndmask_b32_e32 v6, v57, v30, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
; GFX950-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v7, v33, v29, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v7, v37, v30, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX950-NEXT: s_waitcnt vmcnt(5)
+; GFX950-NEXT: v_lshrrev_b32_e32 v30, 16, v58
; GFX950-NEXT: s_waitcnt vmcnt(4)
-; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v30
-; GFX950-NEXT: v_cndmask_b32_e32 v4, v28, v30, vcc
-; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v28
+; GFX950-NEXT: v_lshrrev_b32_e32 v37, 16, v59
+; GFX950-NEXT: v_cndmask_b32_e32 v4, v59, v58, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
-; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v5, v28, v29, vcc
+; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
+; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
+; GFX950-NEXT: v_cndmask_b32_e32 v5, v37, v30, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX950-NEXT: s_waitcnt vmcnt(3)
+; GFX950-NEXT: v_lshrrev_b32_e32 v30, 16, v32
; GFX950-NEXT: s_waitcnt vmcnt(2)
-; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v59
-; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v35
-; GFX950-NEXT: v_cndmask_b32_e32 v2, v35, v59, vcc
+; GFX950-NEXT: v_cndmask_b32_e32 v2, v33, v32, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v33
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
-; GFX950-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v3, v29, v28, vcc
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v3, v32, v30, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX950-NEXT: s_waitcnt vmcnt(1)
+; GFX950-NEXT: v_lshrrev_b32_e32 v30, 16, v34
; GFX950-NEXT: s_waitcnt vmcnt(0)
-; GFX950-NEXT: v_lshrrev_b32_e32 v28, 16, v47
-; GFX950-NEXT: v_lshrrev_b32_e32 v29, 16, v32
-; GFX950-NEXT: v_cndmask_b32_e32 v0, v32, v47, vcc
+; GFX950-NEXT: v_lshrrev_b32_e32 v32, 16, v35
+; GFX950-NEXT: v_cndmask_b32_e32 v0, v35, v34, vcc
; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX950-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse
-; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: v_cndmask_b32_e32 v1, v29, v28, vcc
+; GFX950-NEXT: s_nop 1
+; GFX950-NEXT: v_cndmask_b32_e32 v1, v32, v30, vcc
; GFX950-NEXT: v_perm_b32 v0, v1, v0, s0
; GFX950-NEXT: v_perm_b32 v1, v3, v2, s0
; GFX950-NEXT: v_perm_b32 v2, v5, v4, s0
@@ -45520,17 +45575,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX950-NEXT: v_perm_b32 v4, v9, v8, s0
; GFX950-NEXT: v_perm_b32 v5, v11, v10, s0
; GFX950-NEXT: v_perm_b32 v6, v13, v12, s0
-; GFX950-NEXT: v_perm_b32 v7, v15, v14, s0
+; GFX950-NEXT: v_perm_b32 v7, v36, v14, s0
; GFX950-NEXT: v_perm_b32 v8, v17, v16, s0
; GFX950-NEXT: v_perm_b32 v9, v19, v18, s0
; GFX950-NEXT: v_perm_b32 v10, v21, v20, s0
; GFX950-NEXT: v_perm_b32 v11, v23, v22, s0
; GFX950-NEXT: v_perm_b32 v12, v25, v24, s0
; GFX950-NEXT: v_perm_b32 v13, v27, v26, s0
-; GFX950-NEXT: v_perm_b32 v14, v46, v31, s0
-; GFX950-NEXT: v_perm_b32 v15, v58, v34, s0
-; GFX950-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse
-; GFX950-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse
+; GFX950-NEXT: v_perm_b32 v14, v29, v28, s0
; GFX950-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_vselect_v32bf16:
@@ -45602,120 +45654,150 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX10-NEXT: v_cmp_eq_u32_e64 s43, 1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v30
; GFX10-NEXT: v_cmp_eq_u32_e64 s44, 1, v0
-; GFX10-NEXT: s_clause 0x1f
-; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68
-; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4
-; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72
-; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8
-; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76
-; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12
-; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80
-; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:16
-; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84
-; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:20
-; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88
-; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:24
-; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:92
-; GFX10-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28
-; GFX10-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:96
-; GFX10-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32
-; GFX10-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:100
-; GFX10-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36
-; GFX10-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:104
-; GFX10-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:40
-; GFX10-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:108
-; GFX10-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:44
-; GFX10-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:112
-; GFX10-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:48
-; GFX10-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:116
-; GFX10-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52
-; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120
-; GFX10-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56
-; GFX10-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:124
-; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:60
-; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:128
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:128
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cndmask_b32_e64 v32, v30, v31, s44
-; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
-; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30
-; GFX10-NEXT: v_cndmask_b32_e64 v30, v30, v31, s43
-; GFX10-NEXT: v_cndmask_b32_e64 v31, v28, v29, s42
-; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29
-; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28
-; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v29, s41
-; GFX10-NEXT: v_cndmask_b32_e64 v29, v26, v27, s40
-; GFX10-NEXT: v_lshrrev_b32_e32 v27, 16, v27
-; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26
-; GFX10-NEXT: v_cndmask_b32_e64 v26, v26, v27, s29
-; GFX10-NEXT: v_cndmask_b32_e64 v27, v24, v25, s28
-; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v25
-; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24
-; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v25, s27
-; GFX10-NEXT: v_cndmask_b32_e64 v25, v22, v23, s26
-; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v23
-; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22
-; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v23, s25
-; GFX10-NEXT: v_cndmask_b32_e64 v23, v20, v21, s24
-; GFX10-NEXT: v_lshrrev_b32_e32 v21, 16, v21
-; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20
-; GFX10-NEXT: v_cndmask_b32_e64 v20, v20, v21, s23
-; GFX10-NEXT: v_cndmask_b32_e64 v21, v18, v19, s22
-; GFX10-NEXT: v_lshrrev_b32_e32 v19, 16, v19
-; GFX10-NEXT: v_lshrrev_b32_e32 v18, 16, v18
-; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v19, s21
-; GFX10-NEXT: v_cndmask_b32_e64 v19, v16, v17, s20
-; GFX10-NEXT: v_lshrrev_b32_e32 v17, 16, v17
-; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16
-; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v17, s19
-; GFX10-NEXT: v_cndmask_b32_e64 v17, v14, v15, s18
-; GFX10-NEXT: v_lshrrev_b32_e32 v15, 16, v15
-; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14
-; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v15, s17
-; GFX10-NEXT: v_cndmask_b32_e64 v15, v12, v13, s16
-; GFX10-NEXT: v_lshrrev_b32_e32 v13, 16, v13
-; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12
-; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v13, s15
-; GFX10-NEXT: v_cndmask_b32_e64 v13, v10, v11, s14
-; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v11
-; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v10
-; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v11, s13
-; GFX10-NEXT: v_cndmask_b32_e64 v11, v8, v9, s12
-; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v9
-; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v8
-; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v9, s11
-; GFX10-NEXT: v_cndmask_b32_e64 v9, v6, v7, s10
-; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v7
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v7, s9
-; GFX10-NEXT: v_cndmask_b32_e64 v7, v4, v5, s8
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v5, s7
-; GFX10-NEXT: v_cndmask_b32_e64 v5, v2, v3, s6
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v3, s5
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v0, v1, s4
+; GFX10-NEXT: v_cndmask_b32_e64 v15, v1, v0, s44
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v16, v1, v0, s43
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124
+; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v14, v1, v0, s42
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v2, v5, 0x5040100
-; GFX10-NEXT: v_perm_b32 v2, v4, v7, 0x5040100
-; GFX10-NEXT: v_perm_b32 v4, v8, v11, 0x5040100
-; GFX10-NEXT: v_perm_b32 v5, v10, v13, 0x5040100
-; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
-; GFX10-NEXT: v_perm_b32 v3, v6, v9, 0x5040100
-; GFX10-NEXT: v_perm_b32 v6, v12, v15, 0x5040100
-; GFX10-NEXT: v_perm_b32 v7, v14, v17, 0x5040100
-; GFX10-NEXT: v_perm_b32 v8, v16, v19, 0x5040100
-; GFX10-NEXT: v_perm_b32 v9, v18, v21, 0x5040100
-; GFX10-NEXT: v_perm_b32 v10, v20, v23, 0x5040100
-; GFX10-NEXT: v_perm_b32 v11, v22, v25, 0x5040100
-; GFX10-NEXT: v_perm_b32 v12, v24, v27, 0x5040100
-; GFX10-NEXT: v_perm_b32 v13, v26, v29, 0x5040100
-; GFX10-NEXT: v_perm_b32 v14, v28, v31, 0x5040100
-; GFX10-NEXT: v_perm_b32 v15, v30, v32, 0x5040100
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v17, v1, v0, s41
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120
+; GFX10-NEXT: v_perm_b32 v14, v17, v14, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v13, v1, v0, s40
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v18, v1, v0, s29
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116
+; GFX10-NEXT: v_perm_b32 v13, v18, v13, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v12, v1, v0, s28
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v19, v1, v0, s27
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:112
+; GFX10-NEXT: v_perm_b32 v12, v19, v12, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, v0, s26
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v20, v1, v0, s25
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:108
+; GFX10-NEXT: v_perm_b32 v11, v20, v11, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v10, v1, v0, s24
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v21, v1, v0, s23
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104
+; GFX10-NEXT: v_perm_b32 v10, v21, v10, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v9, v1, v0, s22
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v22, v1, v0, s21
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:36
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:100
+; GFX10-NEXT: v_perm_b32 v9, v22, v9, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v8, v1, v0, s20
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v23, v1, v0, s19
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:96
+; GFX10-NEXT: v_perm_b32 v8, v23, v8, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, v0, s18
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v24, v1, v0, s17
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92
+; GFX10-NEXT: v_perm_b32 v7, v24, v7, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v6, v1, v0, s16
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v25, v1, v0, s15
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:88
+; GFX10-NEXT: v_perm_b32 v6, v25, v6, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v5, v1, v0, s14
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v26, v1, v0, s13
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:84
+; GFX10-NEXT: v_perm_b32 v5, v26, v5, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v4, v1, v0, s12
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v27, v1, v0, s11
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80
+; GFX10-NEXT: v_perm_b32 v4, v27, v4, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, v0, s10
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v28, v1, v0, s9
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76
+; GFX10-NEXT: v_perm_b32 v3, v28, v3, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v1, v0, s8
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v29, v1, v0, s7
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8
+; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72
+; GFX10-NEXT: v_perm_b32 v2, v29, v2, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v30, v1, v0, s6
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v0, s5
+; GFX10-NEXT: s_clause 0x1
+; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68
+; GFX10-NEXT: v_perm_b32 v1, v1, v30, 0x5040100
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cndmask_b32_e64 v32, v31, v0, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v31
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v31, v0, vcc_lo
+; GFX10-NEXT: v_perm_b32 v0, v0, v32, 0x5040100
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11TRUE16-LABEL: v_vselect_v32bf16:
@@ -45723,38 +45805,38 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11TRUE16-NEXT: s_clause 0x1f
; GFX11TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32
-; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
-; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:64
+; GFX11TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128
+; GFX11TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60
; GFX11TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
-; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128
-; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64
-; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
-; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120
-; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
-; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116
-; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52
-; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112
-; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48
-; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108
-; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44
-; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104
-; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40
-; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100
-; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36
-; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96
-; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32
-; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92
-; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28
-; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88
-; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24
-; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84
-; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20
-; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
-; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
-; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
-; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8
-; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
+; GFX11TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:56
+; GFX11TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:120
+; GFX11TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:52
+; GFX11TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:116
+; GFX11TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:48
+; GFX11TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:112
+; GFX11TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX11TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:108
+; GFX11TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:40
+; GFX11TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:104
+; GFX11TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:36
+; GFX11TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:100
+; GFX11TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:32
+; GFX11TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:96
+; GFX11TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:28
+; GFX11TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
+; GFX11TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:24
+; GFX11TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX11TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:20
+; GFX11TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:84
+; GFX11TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:16
+; GFX11TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:80
+; GFX11TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:12
+; GFX11TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:76
+; GFX11TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:8
+; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
+; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
+; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX11TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
; GFX11TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
@@ -45819,55 +45901,56 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 1, v13.h
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
; GFX11TRUE16-NEXT: v_and_b16 v0.h, 1, v31.l
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s26
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
+; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v34.h, s28
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v36.l, v37.l, s26
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v38.l, s27
-; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v38.h, s28
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v39.l, v48.l, s29
-; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v39.h, v48.h, s25
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v49.l, v50.l, s24
-; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v49.h, v50.h, s23
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v51.l, v52.l, s22
-; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v51.h, v52.h, s21
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v53.l, v54.l, s20
-; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v53.h, v54.h, s19
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(15)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v55.l, v64.l, s18
-; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v55.h, v64.h, s17
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(13)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v65.l, v66.l, s16
-; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v65.h, v66.h, s15
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(11)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v67.l, v68.l, s14
-; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v67.h, v68.h, s13
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(9)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v69.l, v70.l, s12
-; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v69.h, v70.h, s11
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(7)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v71.l, v80.l, s10
-; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v71.h, v80.h, s9
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(5)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v81.l, v82.l, s8
-; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v81.h, v82.h, s7
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(3)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s29
+; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v37.h, v36.h, s25
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s24
+; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v39.h, v38.h, s23
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s22
+; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v49.h, v48.h, s21
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s20
+; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v51.h, v50.h, s19
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s18
+; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v53.h, v52.h, s17
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s16
+; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v55.h, v54.h, s15
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(14)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.l, v65.l, v64.l, s14
+; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v65.h, v64.h, s13
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(12)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s12
+; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v67.h, v66.h, s11
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(10)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s10
+; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v69.h, v68.h, s9
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(8)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s8
+; GFX11TRUE16-NEXT: v_cndmask_b16 v4.h, v71.h, v70.h, s7
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(6)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s6
+; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v81.h, v80.h, s5
+; GFX11TRUE16-NEXT: s_waitcnt vmcnt(4)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s4
+; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v83.h, v82.h, s3
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v34.l, v85.l, s4
-; GFX11TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s2
+; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v85.h, v84.h, s1
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v87.l, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, s0
; GFX11TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 1, v0.h
-; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo
-; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s1
-; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v34.h, v85.h, s3
-; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5
-; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v36.h, v37.h, s0
+; GFX11TRUE16-NEXT: v_cndmask_b16 v0.h, v87.h, v86.h, vcc_lo
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v33.h, v32.h, s0
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
@@ -45875,188 +45958,187 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11FAKE16-NEXT: s_clause 0x1f
; GFX11FAKE16-NEXT: scratch_load_u16 v31, off, s32
-; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:128
-; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:64
-; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:124
-; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:60
-; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
-; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
-; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
-; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
-; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
-; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
-; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
-; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
-; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
-; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
-; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
-; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
-; GFX11FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:96
-; GFX11FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:32
-; GFX11FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:92
-; GFX11FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:28
-; GFX11FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:88
-; GFX11FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:24
-; GFX11FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:84
-; GFX11FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:20
-; GFX11FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:80
-; GFX11FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:16
-; GFX11FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:76
-; GFX11FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:12
-; GFX11FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:72
-; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:8
-; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:68
-; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
+; GFX11FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:64
+; GFX11FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:128
+; GFX11FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:60
+; GFX11FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
+; GFX11FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:56
+; GFX11FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:120
+; GFX11FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:52
+; GFX11FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:116
+; GFX11FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:48
+; GFX11FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:112
+; GFX11FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX11FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:108
+; GFX11FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:40
+; GFX11FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:104
+; GFX11FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:36
+; GFX11FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:100
+; GFX11FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:32
+; GFX11FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:96
+; GFX11FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:28
+; GFX11FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
+; GFX11FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:24
+; GFX11FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX11FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:20
+; GFX11FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:84
+; GFX11FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:16
+; GFX11FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:80
+; GFX11FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:12
+; GFX11FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:76
+; GFX11FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:8
+; GFX11FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
+; GFX11FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
+; GFX11FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
; GFX11FAKE16-NEXT: v_and_b32_e32 v30, 1, v30
; GFX11FAKE16-NEXT: v_and_b32_e32 v28, 1, v28
; GFX11FAKE16-NEXT: v_and_b32_e32 v26, 1, v26
; GFX11FAKE16-NEXT: v_and_b32_e32 v24, 1, v24
; GFX11FAKE16-NEXT: v_and_b32_e32 v22, 1, v22
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v30
-; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11FAKE16-NEXT: v_and_b32_e32 v20, 1, v20
; GFX11FAKE16-NEXT: v_and_b32_e32 v18, 1, v18
; GFX11FAKE16-NEXT: v_and_b32_e32 v16, 1, v16
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(30)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v30, v33, v32, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
-; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v32
-; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v33
+; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(28)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v34, v35, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v28, v35, v34, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
-; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v34
-; GFX11FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v35
+; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(26)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v36, v37, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v26, v37, v36, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX11FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 1, v7
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v36
-; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v37
+; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(24)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v38, v39, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v24, v39, v38, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
-; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v38
-; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v39
+; GFX11FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(22)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v48, v49, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v49, v48, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX11FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11FAKE16-NEXT: v_and_b32_e32 v11, 1, v11
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v48
-; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v49
+; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(20)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v50, v51, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v20, v51, v50, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v50
-; GFX11FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v51
+; GFX11FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(18)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v52, v53, vcc_lo
-; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX11FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v18, v53, v52, vcc_lo
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v52
-; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v53
+; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(16)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v16, v54, v55, vcc_lo
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v16, v55, v54 :: v_dual_and_b32 v15, 1, v15
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v54
-; GFX11FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v55
+; GFX11FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(14)
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v64, v65 :: v_dual_and_b32 v19, 1, v19
+; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v14, v65, v64 :: v_dual_and_b32 v17, 1, v17
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX11FAKE16-NEXT: v_and_b32_e32 v17, 1, v17
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
+; GFX11FAKE16-NEXT: v_and_b32_e32 v19, 1, v19
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v64, 16, v64
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v65, 16, v65
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(12)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v66, v67, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v12, v67, v66, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
+; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v66, 16, v66
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v67, 16, v67
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(10)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v68, v69, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v69, v68, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX11FAKE16-NEXT: v_and_b32_e32 v21, 1, v21
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
+; GFX11FAKE16-NEXT: v_and_b32_e32 v23, 1, v23
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v68, 16, v68
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v69, 16, v69
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(8)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v70, v71, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v8, v71, v70, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71
+; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v70, 16, v70
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v71, 16, v71
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(6)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v80, v81, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v6, v81, v80, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX11FAKE16-NEXT: v_and_b32_e32 v25, 1, v25
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81
+; GFX11FAKE16-NEXT: v_and_b32_e32 v27, 1, v27
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v80, 16, v80
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v81, 16, v81
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(4)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v82, v83, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v4, v83, v82, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83
+; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v82, 16, v82
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v83, 16, v83
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v84, v85, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v2, v85, v84, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11FAKE16-NEXT: v_and_b32_e32 v29, 1, v29
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85
+; GFX11FAKE16-NEXT: v_and_b32_e32 v31, 1, v31
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v84, 16, v84
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v85, 16, v85
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v86, v87, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v0, v87, v86, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
-; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v86, 16, v86
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc_lo
+; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v87, 16, v87
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v31, v33, v32, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v34, v35, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v29, v35, v34, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v36, v37, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v27, v37, v36, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v25, v39, v38, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v48, v49, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v23, v49, v48, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v21, v51, v50, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v19, v53, v52, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v17, v55, v54, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v64, v65, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v15, v65, v64, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v66, v67, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v13, v67, v66, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v68, v69, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v11, v69, v68, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v80, v81, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v7, v81, v80, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v84, v85, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v3, v85, v84, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v86, v87, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v1, v87, v86, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v82, v83, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v5, v83, v82, vcc_lo
; GFX11FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
; GFX11FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
-; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v70, v71, vcc_lo
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v70, vcc_lo
; GFX11FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
; GFX11FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
@@ -46076,38 +46158,38 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX1250TRUE16-NEXT: s_clause 0x20
; GFX1250TRUE16-NEXT: scratch_load_u16 v31, off, s32
-; GFX1250TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:68
-; GFX1250TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
-; GFX1250TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:76
+; GFX1250TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:64
+; GFX1250TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:128
+; GFX1250TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:60
; GFX1250TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:124
-; GFX1250TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:128
-; GFX1250TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:64
-; GFX1250TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60
-; GFX1250TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:120
-; GFX1250TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:56
-; GFX1250TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:116
-; GFX1250TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:52
-; GFX1250TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:112
-; GFX1250TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:48
-; GFX1250TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:108
-; GFX1250TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:44
-; GFX1250TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:104
-; GFX1250TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:40
-; GFX1250TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:100
-; GFX1250TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:36
-; GFX1250TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:96
-; GFX1250TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:32
-; GFX1250TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:92
-; GFX1250TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:28
-; GFX1250TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:88
-; GFX1250TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:24
-; GFX1250TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:84
-; GFX1250TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:20
-; GFX1250TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:80
-; GFX1250TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:16
-; GFX1250TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:12
-; GFX1250TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:8
-; GFX1250TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:4
+; GFX1250TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:56
+; GFX1250TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:120
+; GFX1250TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:52
+; GFX1250TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:116
+; GFX1250TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:48
+; GFX1250TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:112
+; GFX1250TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX1250TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:108
+; GFX1250TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:40
+; GFX1250TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:104
+; GFX1250TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:36
+; GFX1250TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:100
+; GFX1250TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:32
+; GFX1250TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:96
+; GFX1250TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:28
+; GFX1250TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
+; GFX1250TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:24
+; GFX1250TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX1250TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:20
+; GFX1250TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:84
+; GFX1250TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:16
+; GFX1250TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:80
+; GFX1250TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:12
+; GFX1250TRUE16-NEXT: scratch_load_b32 v83, off, s32 offset:76
+; GFX1250TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:8
+; GFX1250TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
+; GFX1250TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
+; GFX1250TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
; GFX1250TRUE16-NEXT: v_and_b16 v0.h, 1, v1.l
; GFX1250TRUE16-NEXT: v_and_b16 v0.l, 1, v0.l
; GFX1250TRUE16-NEXT: v_and_b16 v1.l, 1, v3.l
@@ -46172,219 +46254,225 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
; GFX1250TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 1, v11.h
; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x20
; GFX1250TRUE16-NEXT: v_and_b16 v0.h, 1, v31.l
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1e
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s26
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1c
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v34.h, s28
; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1a
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v15.l, v36.l, v37.l, s26
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x19
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v38.l, s27
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v14.h, v35.h, v38.h, s28
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x17
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v13.l, v39.l, v48.l, s29
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v13.h, v39.h, v48.h, s25
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x15
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v12.l, v49.l, v50.l, s24
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v12.h, v49.h, v50.h, s23
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x13
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v11.l, v51.l, v52.l, s22
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v11.h, v51.h, v52.h, s21
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x11
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v10.l, v53.l, v54.l, s20
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v10.h, v53.h, v54.h, s19
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xf
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v9.l, v55.l, v64.l, s18
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v9.h, v55.h, v64.h, s17
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xd
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v8.l, v65.l, v66.l, s16
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v8.h, v65.h, v66.h, s15
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xb
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v7.l, v67.l, v68.l, s14
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v7.h, v67.h, v68.h, s13
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x9
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v6.l, v69.l, v70.l, s12
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v6.h, v69.h, v70.h, s11
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x7
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v5.l, v71.l, v80.l, s10
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v5.h, v71.h, v80.h, s9
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x5
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v4.l, v81.l, v82.l, s8
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v4.h, v81.h, v82.h, s7
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x3
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v3.l, v83.l, v84.l, s6
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s29
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v13.h, v37.h, v36.h, s25
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x18
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s24
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v12.h, v39.h, v38.h, s23
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x16
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s22
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v11.h, v49.h, v48.h, s21
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x14
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s20
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v10.h, v51.h, v50.h, s19
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x12
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s18
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v9.h, v53.h, v52.h, s17
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x10
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s16
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v8.h, v55.h, v54.h, s15
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xe
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v7.l, v65.l, v64.l, s14
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v7.h, v65.h, v64.h, s13
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xc
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s12
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v6.h, v67.h, v66.h, s11
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0xa
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s10
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v5.h, v69.h, v68.h, s9
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x8
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s8
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v4.h, v71.h, v70.h, s7
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x6
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s6
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v3.h, v81.h, v80.h, s5
+; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x4
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s4
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v2.h, v83.h, v82.h, s3
; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x2
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v2.l, v34.l, v85.l, s4
-; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x1
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v1.l, v33.l, v86.l, s2
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s2
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v1.h, v85.h, v84.h, s0
; GFX1250TRUE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v0.l, v32.l, v87.l, s1
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, s1
; GFX1250TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 1, v0.h
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v0.h, v32.h, v87.h, vcc_lo
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v1.h, v33.h, v86.h, s0
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v2.h, v34.h, v85.h, s3
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v3.h, v83.h, v84.h, s5
-; GFX1250TRUE16-NEXT: v_cndmask_b16 v15.h, v36.h, v37.h, s1
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v0.h, v87.h, v86.h, vcc_lo
+; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250TRUE16-NEXT: v_cndmask_b16 v15.h, v33.h, v32.h, s1
; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
;
; GFX1250FAKE16-LABEL: v_vselect_v32bf16:
; GFX1250FAKE16: ; %bb.0:
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
-; GFX1250FAKE16-NEXT: s_clause 0x1b
+; GFX1250FAKE16-NEXT: s_clause 0x19
; GFX1250FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:60
; GFX1250FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:124
; GFX1250FAKE16-NEXT: scratch_load_u16 v33, off, s32
-; GFX1250FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:128
-; GFX1250FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:64
-; GFX1250FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:120
-; GFX1250FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:56
-; GFX1250FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:116
-; GFX1250FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:52
-; GFX1250FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:112
-; GFX1250FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48
-; GFX1250FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:108
-; GFX1250FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:44
-; GFX1250FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:104
-; GFX1250FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:40
-; GFX1250FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:100
-; GFX1250FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:36
-; GFX1250FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:76
-; GFX1250FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:12
-; GFX1250FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:96
-; GFX1250FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:32
-; GFX1250FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:80
-; GFX1250FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:84
-; GFX1250FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:92
-; GFX1250FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:28
-; GFX1250FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:20
-; GFX1250FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:88
-; GFX1250FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:24
+; GFX1250FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:64
+; GFX1250FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:128
+; GFX1250FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:56
+; GFX1250FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:120
+; GFX1250FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:52
+; GFX1250FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:116
+; GFX1250FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:48
+; GFX1250FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:112
+; GFX1250FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:44
+; GFX1250FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:108
+; GFX1250FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:40
+; GFX1250FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:104
+; GFX1250FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:36
+; GFX1250FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:100
+; GFX1250FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:32
+; GFX1250FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:96
+; GFX1250FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:28
+; GFX1250FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:92
+; GFX1250FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:24
+; GFX1250FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:88
+; GFX1250FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:20
+; GFX1250FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:84
+; GFX1250FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:16
; GFX1250FAKE16-NEXT: v_and_b32_e32 v30, 1, v30
; GFX1250FAKE16-NEXT: v_and_b32_e32 v29, 1, v29
+; GFX1250FAKE16-NEXT: s_clause 0x1
+; GFX1250FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:80
+; GFX1250FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:12
; GFX1250FAKE16-NEXT: v_and_b32_e32 v26, 1, v26
; GFX1250FAKE16-NEXT: v_and_b32_e32 v24, 1, v24
; GFX1250FAKE16-NEXT: v_and_b32_e32 v22, 1, v22
; GFX1250FAKE16-NEXT: v_and_b32_e32 v20, 1, v20
; GFX1250FAKE16-NEXT: v_and_b32_e32 v18, 1, v18
; GFX1250FAKE16-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v12, 1, v12
; GFX1250FAKE16-NEXT: v_and_b32_e32 v10, 1, v10
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v8, 1, v8
; GFX1250FAKE16-NEXT: v_and_b32_e32 v6, 1, v6
; GFX1250FAKE16-NEXT: v_and_b32_e32 v4, 1, v4
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 1, v0
; GFX1250FAKE16-NEXT: v_and_b32_e32 v3, 1, v3
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v25, 1, v25
; GFX1250FAKE16-NEXT: v_and_b32_e32 v23, 1, v23
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v13, 1, v13
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v15, 1, v15
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v21, 1, v21
; GFX1250FAKE16-NEXT: v_and_b32_e32 v11, 1, v11
-; GFX1250FAKE16-NEXT: v_and_b32_e32 v19, 1, v19
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x1a
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v83, 16, v32 :: v_dual_bitop2_b32 v17, 1, v17 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v83, 16, v32 :: v_dual_bitop2_b32 v15, 1, v15 bitop3:0x40
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e64 s1, 1, v30
; GFX1250FAKE16-NEXT: v_and_b32_e32 v28, 1, v28
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x17
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v30, v34, v35, s1 :: v_dual_bitop2_b32 v33, 1, v33 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v30, v35, v34, s1 :: v_dual_bitop2_b32 v33, 1, v33 bitop3:0x40
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v28
; GFX1250FAKE16-NEXT: v_lshrrev_b32_e32 v28, 16, v31
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e64 s0, 1, v29
-; GFX1250FAKE16-NEXT: scratch_load_b32 v29, off, s32 offset:16
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_lshrrev_b32 v34, 16, v34
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v31, v32, v31, vcc_lo
-; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v33
-; GFX1250FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:72
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e64 v28, v83, v28, s0
-; GFX1250FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:4
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v34, v34, v35, vcc_lo
+; GFX1250FAKE16-NEXT: scratch_load_b32 v29, off, s32 offset:76
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v34, 16, v34 :: v_dual_lshrrev_b32 v35, 16, v35
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v28, v83, v28, s0 :: v_dual_cndmask_b32 v31, v32, v31, vcc_lo
; GFX1250FAKE16-NEXT: s_clause 0x1
-; GFX1250FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:68
-; GFX1250FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:8
+; GFX1250FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:8
+; GFX1250FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:68
+; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v33
+; GFX1250FAKE16-NEXT: scratch_load_b32 v33, off, s32 offset:72
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v34, v35, v34, vcc_lo
+; GFX1250FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:4
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v26
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x1a
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v26, v36, v37, vcc_lo :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v26, v37, v36, vcc_lo :: v_dual_bitop2_b32 v1, 1, v1 bitop3:0x40
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v24
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v37, 16, v37 :: v_dual_bitop2_b32 v2, 1, v2 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v37, 16, v37 :: v_dual_lshrrev_b32 v36, 16, v36
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x18
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v36, 16, v36 :: v_dual_cndmask_b32 v24, v38, v39, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v24, v39, v38, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v22
; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v38, 16, v38 :: v_dual_bitop2_b32 v7, 1, v7 bitop3:0x40
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x16
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v22, v48, v49 :: v_dual_lshrrev_b32 v39, 16, v39
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v22, v49, v48 :: v_dual_lshrrev_b32 v39, 16, v39
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v20
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v49, 16, v49 :: v_dual_bitop2_b32 v8, 1, v8 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v9, 1, v9
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v49, 16, v49 :: v_dual_lshrrev_b32 v48, 16, v48
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x14
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v48, 16, v48 :: v_dual_cndmask_b32 v20, v50, v51, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v20, v51, v50, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v18
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v51, 16, v51 :: v_dual_bitop2_b32 v12, 1, v12 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v51, 16, v51 :: v_dual_bitop2_b32 v13, 1, v13 bitop3:0x40
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x12
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v50, 16, v50 :: v_dual_cndmask_b32 v18, v52, v53, vcc_lo
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v50, 16, v50 :: v_dual_cndmask_b32 v18, v53, v52, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v53, 16, v53 :: v_dual_bitop2_b32 v14, 1, v14 bitop3:0x40
+; GFX1250FAKE16-NEXT: v_and_b32_e32 v17, 1, v17
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v53, 16, v53 :: v_dual_lshrrev_b32 v52, 16, v52
; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x10
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v52, 16, v52 :: v_dual_cndmask_b32 v16, v54, v55, vcc_lo
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v16, v55, v54 :: v_dual_lshrrev_b32 v55, 16, v55
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v14
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v55, 16, v55 :: v_dual_lshrrev_b32 v54, 16, v54
-; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xc
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v14, v66, v67, vcc_lo
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v54, 16, v54 :: v_dual_bitop2_b32 v19, 1, v19 bitop3:0x40
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xe
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v14, v65, v64, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v67, 16, v67 :: v_dual_lshrrev_b32 v66, 16, v66
-; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x8
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v12, v70, v71, vcc_lo
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v64, 16, v64 :: v_dual_bitop2_b32 v21, 1, v21 bitop3:0x40
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xc
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v12, v67, v66 :: v_dual_lshrrev_b32 v65, 16, v65
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v10
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v70, 16, v70 :: v_dual_bitop2_b32 v25, 1, v25 bitop3:0x40
-; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x5
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v10, v81, v82 :: v_dual_lshrrev_b32 v71, 16, v71
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v66, 16, v66 :: v_dual_lshrrev_b32 v67, 16, v67
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0xa
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v10, v69, v68, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v8
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v82, 16, v82 :: v_dual_bitop2_b32 v27, 1, v27 bitop3:0x40
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v8, v69, v80 :: v_dual_lshrrev_b32 v81, 16, v81
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v68, 16, v68 :: v_dual_bitop2_b32 v27, 1, v27 bitop3:0x40
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x8
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v8, v71, v70 :: v_dual_lshrrev_b32 v69, 16, v69
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v6
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v80, 16, v80 :: v_dual_lshrrev_b32 v69, 16, v69
-; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x4
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v6, v68, v29 :: v_dual_lshrrev_b32 v29, 16, v29
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v70, 16, v70 :: v_dual_lshrrev_b32 v71, 16, v71
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x6
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v6, v81, v80, vcc_lo :: v_dual_lshrrev_b32 v80, 16, v80
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v68, 16, v68 :: v_dual_cndmask_b32 v4, v64, v65, vcc_lo
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x4
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v4, v29, v82 :: v_dual_lshrrev_b32 v81, 16, v81
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v65, 16, v65 :: v_dual_lshrrev_b32 v64, 16, v64
-; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x0
-; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v2, v32, v33 :: v_dual_lshrrev_b32 v33, 16, v33
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v82, 16, v82 :: v_dual_lshrrev_b32 v29, 16, v29
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x1
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v2, v33, v32, vcc_lo :: v_dual_lshrrev_b32 v32, 16, v32
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v32, 16, v32 :: v_dual_cndmask_b32 v0, v35, v83, vcc_lo
+; GFX1250FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250FAKE16-NEXT: v_dual_cndmask_b32 v0, v83, v35 :: v_dual_lshrrev_b32 v33, 16, v33
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v27
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v83, 16, v83 :: v_dual_cndmask_b32 v27, v36, v37, vcc_lo
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_cndmask_b32 v27, v37, v36, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v25
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v25, v38, v39, vcc_lo
+; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v83, 16, v83 :: v_dual_cndmask_b32 v25, v39, v38, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v23
-; GFX1250FAKE16-NEXT: v_dual_lshrrev_b32 v35, 16, v35 :: v_dual_cndmask_b32 v23, v48, v49, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v23, v49, v48, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v21
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v21, v50, v51, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v21, v51, v50, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v19
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v19, v52, v53, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v19, v53, v52, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v17
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v17, v54, v55, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v17, v55, v54, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v15
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v15, v66, v67, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v15, v65, v64, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v13
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v13, v70, v71, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v13, v67, v66, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v11
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v11, v81, v82, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v11, v69, v68, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v7
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v7, v68, v29, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v7, v81, v80, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v3, v32, v33, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v3, v33, v32, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v1, v35, v83, vcc_lo
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v1, v83, v35, vcc_lo
; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v5, v64, v65, vcc_lo
-; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
-; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v9, v69, v80, vcc_lo
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v5, v29, v82, vcc_lo
+; GFX1250FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v9
; GFX1250FAKE16-NEXT: v_perm_b32 v1, v3, v2, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v2, v5, v4, 0x5040100
+; GFX1250FAKE16-NEXT: v_cndmask_b32_e32 v9, v71, v70, vcc_lo
; GFX1250FAKE16-NEXT: v_perm_b32 v3, v7, v6, 0x5040100
-; GFX1250FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v6, v13, v12, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v7, v15, v14, 0x5040100
+; GFX1250FAKE16-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v8, v17, v16, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v9, v19, v18, 0x5040100
; GFX1250FAKE16-NEXT: v_perm_b32 v10, v21, v20, 0x5040100
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-extract-concat.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-extract-concat.ll
new file mode 100644
index 0000000000000..819284f043630
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-extract-concat.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O2 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 \
+; RUN: -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck %s \
+; RUN: --check-prefix=COMBINE --implicit-check-not=REG_SEQUENCE
+; RUN: llc -O2 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 \
+; RUN: -verify-machineinstrs -combiner-disabled -stop-after=amdgpu-isel < %s \
+; RUN: | FileCheck %s --check-prefix=NOCOMBINE
+
+declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64 immarg)
+
+define <4 x i32> @extract_concat_multi_operand(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) nounwind {
+ ; COMBINE-LABEL: name: extract_concat_multi_operand
+ ; COMBINE: bb.0 (%ir-block.0):
+ ; COMBINE-NEXT: liveins: $vgpr4, $vgpr5, $vgpr6, $vgpr7
+ ; COMBINE-NEXT: {{ $}}
+ ; COMBINE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; COMBINE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; COMBINE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; COMBINE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; COMBINE-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; COMBINE-NEXT: $vgpr1 = COPY [[COPY2]]
+ ; COMBINE-NEXT: $vgpr2 = COPY [[COPY1]]
+ ; COMBINE-NEXT: $vgpr3 = COPY [[COPY]]
+ ; COMBINE-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ ;
+ ; NOCOMBINE-LABEL: name: extract_concat_multi_operand
+ ; NOCOMBINE: bb.0 (%ir-block.0):
+ ; NOCOMBINE-NEXT: liveins: $vgpr4, $vgpr5, $vgpr6, $vgpr7
+ ; NOCOMBINE-NEXT: {{ $}}
+ ; NOCOMBINE-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; NOCOMBINE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; NOCOMBINE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; NOCOMBINE-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; NOCOMBINE-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2, [[COPY]], %subreg.sub3
+ ; NOCOMBINE-NEXT: [[COPY4:%[0-9]+]]:av_32 = COPY [[REG_SEQUENCE]].sub0
+ ; NOCOMBINE-NEXT: [[COPY5:%[0-9]+]]:av_32 = COPY [[REG_SEQUENCE]].sub1
+ ; NOCOMBINE-NEXT: [[COPY6:%[0-9]+]]:av_32 = COPY [[REG_SEQUENCE]].sub2
+ ; NOCOMBINE-NEXT: [[COPY7:%[0-9]+]]:av_32 = COPY [[REG_SEQUENCE]].sub3
+ ; NOCOMBINE-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; NOCOMBINE-NEXT: $vgpr1 = COPY [[COPY5]]
+ ; NOCOMBINE-NEXT: $vgpr2 = COPY [[COPY6]]
+ ; NOCOMBINE-NEXT: $vgpr3 = COPY [[COPY7]]
+ ; NOCOMBINE-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
+ %lo = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %hi = shufflevector <2 x i32> %c, <2 x i32> %d, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %wide = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %ext = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %wide, i64 4)
+ ret <4 x i32> %ext
+}
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 5b279e5953629..20d42192b993a 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -163,20 +163,20 @@ define <8 x i64> @combine_zext_pmuludq_256(<8 x i32> %a) {
;
; AVX1-LABEL: combine_zext_pmuludq_256:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [715827883,715827883]
; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm2
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_zext_pmuludq_256:
diff --git a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
index 019496c504887..7584acc3654b2 100644
--- a/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
+++ b/llvm/test/CodeGen/X86/ifma-combine-vpmadd52.ll
@@ -447,22 +447,22 @@ define <16 x i64> @test_1024_combine_split(<16 x i64> %x, <16 x i64> %y, <16 x i
; AVX512-NOIFMA-LABEL: test_1024_combine_split:
; AVX512-NOIFMA: # %bb.0:
; AVX512-NOIFMA-NEXT: vpbroadcastq {{.*#+}} zmm6 = [67108863,67108863,67108863,67108863,67108863,67108863,67108863,67108863]
-; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm0, %zmm0
-; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm1, %zmm1
; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm2, %zmm2
-; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm6
-; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm7
-; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm4, %ymm8
+; AVX512-NOIFMA-NEXT: vpandq %zmm6, %zmm3, %zmm3
+; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm3, %ymm6
+; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm7
+; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm5, %ymm8
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm6, %ymm7, %ymm8
-; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
-; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm0
-; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm3, %ymm2
-; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm5, %ymm6
-; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm4, %ymm6
; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm3, %ymm1, %ymm5
-; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm1
+; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512-NOIFMA-NEXT: vextracti64x4 $1, %zmm4, %ymm6
+; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm1, %ymm3, %ymm6
+; AVX512-NOIFMA-NEXT: {vex} vpmadd52luq %ymm2, %ymm0, %ymm4
+; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm0
+; AVX512-NOIFMA-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm1
; AVX512-NOIFMA-NEXT: retq
%x_masked = and <16 x i64> %x, splat (i64 67108863)
%y_masked = and <16 x i64> %y, splat (i64 67108863)
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index fd67fdcbabadf..7093ecbcf7777 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1861,16 +1861,16 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
;
; AVX1-LABEL: pmaddwd_32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_32:
@@ -2067,16 +2067,16 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
;
; AVX1-LABEL: jumbled_indices16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vpmaddwd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: jumbled_indices16:
@@ -2131,26 +2131,26 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
;
; AVX1-LABEL: jumbled_indices32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpmaddwd %xmm8, %xmm9, %xmm8
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm8
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm8, %xmm4
-; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmaddwd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm7, %xmm9, %xmm7
+; AVX1-NEXT: vpmaddwd %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm6, %xmm9, %xmm6
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm9, %xmm5
+; AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
; AVX1-NEXT: retq
;
; AVX2-LABEL: jumbled_indices32:
@@ -2163,16 +2163,16 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
;
; AVX512F-LABEL: jumbled_indices32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm4
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm5
; AVX512F-NEXT: vpmaddwd %ymm4, %ymm5, %ymm4
-; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
-; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT: vpmaddwd %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmaddwd %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5
+; AVX512F-NEXT: vpmaddwd %ymm3, %ymm5, %ymm3
+; AVX512F-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: jumbled_indices32:
@@ -2277,12 +2277,12 @@ define <16 x i32> @pmaddwd_512(ptr %Aptr, ptr %Bptr) {
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2
; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1
-; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_512:
@@ -2355,25 +2355,25 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) {
;
; AVX1-LABEL: pmaddwd_1024:
; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm0, %xmm3
+; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
+; AVX1-NEXT: vpmaddwd 96(%rsi), %xmm0, %xmm4
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm0
+; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm0, %xmm5
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
+; AVX1-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX1-NEXT: vmovdqa 48(%rdi), %xmm7
+; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm7, %xmm7
+; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm6, %xmm6
; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1
-; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX1-NEXT: vpmaddwd 96(%rsi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: retq
;
@@ -2395,12 +2395,12 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) {
; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm3
+; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2
; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm1
-; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddwd_1024:
diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll
index d6c9877cd99b6..c4f71e3aaa44f 100644
--- a/llvm/test/CodeGen/X86/pmaddubsw.ll
+++ b/llvm/test/CodeGen/X86/pmaddubsw.ll
@@ -116,25 +116,25 @@ define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) {
;
; AVX1-LABEL: pmaddubsw_512:
; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa 112(%rsi), %xmm0
+; AVX1-NEXT: vpmaddubsw 112(%rdi), %xmm0, %xmm3
+; AVX1-NEXT: vmovdqa 96(%rsi), %xmm0
+; AVX1-NEXT: vpmaddubsw 96(%rdi), %xmm0, %xmm4
+; AVX1-NEXT: vmovdqa 80(%rsi), %xmm0
+; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa 64(%rsi), %xmm0
+; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm0, %xmm5
; AVX1-NEXT: vmovdqa (%rsi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rsi), %xmm1
-; AVX1-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX1-NEXT: vmovdqa 48(%rsi), %xmm3
+; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6
+; AVX1-NEXT: vmovdqa 48(%rsi), %xmm7
+; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm7, %xmm7
+; AVX1-NEXT: vpmaddubsw 32(%rdi), %xmm6, %xmm6
; AVX1-NEXT: vpmaddubsw 16(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpmaddubsw 48(%rdi), %xmm3, %xmm1
-; AVX1-NEXT: vpmaddubsw 32(%rdi), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovdqa 80(%rsi), %xmm2
-; AVX1-NEXT: vpmaddubsw 80(%rdi), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa 64(%rsi), %xmm3
-; AVX1-NEXT: vpmaddubsw 64(%rdi), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vmovdqa 112(%rsi), %xmm3
-; AVX1-NEXT: vpmaddubsw 112(%rdi), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa 96(%rsi), %xmm4
-; AVX1-NEXT: vpmaddubsw 96(%rdi), %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT: retq
;
@@ -156,12 +156,12 @@ define <64 x i16> @pmaddubsw_512(ptr %Aptr, ptr %Bptr) {
; AVX512F-NEXT: vmovdqa 32(%rsi), %ymm1
; AVX512F-NEXT: vmovdqa 64(%rsi), %ymm2
; AVX512F-NEXT: vmovdqa 96(%rsi), %ymm3
+; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm3
+; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2
; AVX512F-NEXT: vpmaddubsw 32(%rdi), %ymm1, %ymm1
; AVX512F-NEXT: vpmaddubsw (%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmaddubsw 96(%rdi), %ymm3, %ymm1
-; AVX512F-NEXT: vpmaddubsw 64(%rdi), %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddubsw_512:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 8b49aefa149e9..bc7ed7552e77c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -485,32 +485,32 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13]
-; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
-; AVX512-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0]
+; AVX512-NEXT: vpermi2d %xmm6, %xmm7, %xmm8
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa %xmm6, 32(%rax)
+; AVX512-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -519,32 +519,33 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8,10,1,3,8,10,1,3]
-; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,8,8,0,0,8,8,0]
+; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermi2d %ymm5, %ymm4, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0]
+; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3]
+; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm2, 32(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -553,32 +554,32 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,8,9,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,20,21,28,29,4,5,12,13]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm7
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2,3,4,5],ymm5[6],ymm7[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm1
-; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,3,7,0]
+; AVX512DQ-NEXT: vpermi2d %xmm6, %xmm7, %xmm8
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqa %xmm6, 32(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -587,32 +588,33 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm3[0],xmm2[0]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [8,10,1,3,8,10,1,3]
-; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,8,9,0,1,4,5,u,u,u,u,2,3,10,11,2,3,10,11,u,u,u,u,24,25,28,29,4,5,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5],ymm5[6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,8,9,0,1,8,9,4,5,12,13,2,3,10,11,18,19,26,27,24,25,30,31,20,21,28,29,20,21,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,8,8,0,0,8,8,0]
+; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm5, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,7,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,5,6,3]
+; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 32(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -621,19 +623,19 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,32,40,0,0,1,9,33,41,0,0,2,10,34,42,0,0,3,11,35,43,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,32,40,6,7,8,9,33,41,12,13,14,15,34,42,18,19,20,21,35,43,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512BW-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -642,19 +644,19 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,32,40,0,0,1,9,33,41,0,0,2,10,34,42,0,0,3,11,35,43,0,0,0,0,0,0,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,32,40,6,7,8,9,33,41,12,13,14,15,34,42,18,19,20,21,35,43,0,0,0,0,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -663,19 +665,19 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,32,40,0,0,1,9,33,41,0,0,2,10,34,42,0,0,3,11,35,43,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,32,40,6,7,8,9,33,41,12,13,14,15,34,42,18,19,20,21,35,43,0,0,0,0,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
+; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -684,19 +686,19 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,0,0,0,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,8,32,40,0,0,1,9,33,41,0,0,2,10,34,42,0,0,3,11,35,43,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,32,40,6,7,8,9,33,41,12,13,14,15,34,42,18,19,20,21,35,43,0,0,0,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 9918958f7276c..a8d7d5040bf51 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -137,168 +137,168 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-LABEL: store_i32_stride6_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512-NEXT: vmovaps %ymm0, (%rax)
+; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512-NEXT: vmovdqa %ymm2, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i32_stride6_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512-FCP-NEXT: vmovaps %ymm0, (%rax)
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i32_stride6_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512DQ-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-NEXT: vmovaps %ymm0, (%rax)
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i32_stride6_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-FCP-NEXT: vmovaps %ymm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i32_stride6_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512BW-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512BW-NEXT: vmovaps %ymm0, (%rax)
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512BW-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i32_stride6_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512BW-FCP-NEXT: vmovaps %ymm0, (%rax)
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i32_stride6_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-BW-NEXT: vmovaps %ymm0, (%rax)
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i32_stride6_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vextractf32x4 $2, %zmm0, 32(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm0, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm3, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,4,16,20,0,0,1,5,17,21,0,0,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm5, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,16,20,6,7,8,9,17,21,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $2, %zmm2, 32(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i32>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 3e895ab28ee2a..d6c65fa82fa6c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -783,30 +783,32 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
-; AVX512-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,ymm0[u,u,1,9],zero,zero,ymm0[u,u,2,10],zero,zero,ymm0[u,u,19,27],zero,zero,ymm0[u,u,20,28],zero,zero,ymm0[u,u,21,29]
+; AVX512-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & mem)
+; AVX512-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -815,30 +817,32 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,ymm0[u,u,1,9],zero,zero,ymm0[u,u,2,10],zero,zero,ymm0[u,u,19,27],zero,zero,ymm0[u,u,20,28],zero,zero,ymm0[u,u,21,29]
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & mem)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -847,30 +851,32 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
-; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,ymm0[u,u,1,9],zero,zero,ymm0[u,u,2,10],zero,zero,ymm0[u,u,19,27],zero,zero,ymm0[u,u,20,28],zero,zero,ymm0[u,u,21,29]
+; AVX512DQ-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & mem)
+; AVX512DQ-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -879,30 +885,32 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (mem & (ymm4 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u,u],zero,zero,ymm2[1,9,u,u],zero,zero,ymm2[2,10,u,u],zero,zero,ymm2[19,27,u,u],zero,zero,ymm2[20,28,u,u],zero,zero
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,ymm0[u,u,1,9],zero,zero,ymm0[u,u,2,10],zero,zero,ymm0[u,u,19,27],zero,zero,ymm0[u,u,20,28],zero,zero,ymm0[u,u,21,29]
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[18,26],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & mem)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -911,32 +919,35 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,ymm0[21,29]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero,zero,zero
+; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u]
; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512BW-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa %xmm1, 32(%rax)
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -945,32 +956,35 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,ymm0[21,29]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u]
; AVX512BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -979,32 +993,35 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,ymm0[21,29]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u]
; AVX512DQ-BW-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 32(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -1013,32 +1030,35 @@ define void @store_i8_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,8,0,8,u,u,1,9,1,9,u,u,2,10,2,10,u,u,3,11,3,11,u,u,4,12,4,12,u,u,5,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6],ymm3[7],ymm5[8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13,14],ymm3[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[2,10,1,9,0,8,3,11,u,u,u,u,4,12,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [5,13,u,u,6,14,6,14,u,u,7,15,7,15,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,2,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm5[0]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,8],zero,zero,zero,zero,ymm0[1,9],zero,zero,zero,zero,ymm0[2,10],zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,ymm0[20,28],zero,zero,zero,zero,ymm0[21,29]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,ymm2[2,10],zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,ymm2[20,28],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,0,2]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,8,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,19,27,u,u,u,u,20,28,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $18724, %cx # imm = 0x4924
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm3 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,u,6,14],zero,zero,xmm0[u,u,7,15],zero,zero,xmm0[u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[5,13,u,u],zero,zero,xmm1[6,14,u,u],zero,zero,xmm1[7,15,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,5,13,u,u,u,u,6,14,u,u,u,u,7,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 32(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 32(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <8 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
index aaeab617d56c8..76dfd019c0883 100644
--- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
+++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll
@@ -3207,45 +3207,47 @@ define void @mask_replication_factor6_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou
; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm0
; AVX512BW-ONLY-NEXT: vpmovm2w %k0, %zmm1
; AVX512BW-ONLY-NEXT: movl $1, %eax
-; AVX512BW-ONLY-NEXT: kmovd %eax, %k1
-; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
-; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k1
-; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z}
-; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k1
-; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm2 {%k1} {z}
+; AVX512BW-ONLY-NEXT: kmovd %eax, %k2
+; AVX512BW-ONLY-NEXT: vmovdqu16 %zmm0, %zmm1 {%k2}
+; AVX512BW-ONLY-NEXT: vpmovw2m %zmm1, %k2
+; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT: kshiftrd $16, %k2, %k2
+; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k2} {z}
+; AVX512BW-ONLY-NEXT: vpmovm2w %k1, %zmm2
; AVX512BW-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm3 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
-; AVX512BW-ONLY-NEXT: vpermw %zmm0, %zmm3, %zmm0
-; AVX512BW-ONLY-NEXT: vpmovw2m %zmm0, %k1
+; AVX512BW-ONLY-NEXT: vpermw %zmm2, %zmm3, %zmm2
+; AVX512BW-ONLY-NEXT: vpmovw2m %zmm2, %k1
; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2
-; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm0 {%k2} {z}
+; AVX512BW-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm2 {%k2} {z}
; AVX512BW-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm3 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $48, %k0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm4 {%k1} {z}
; AVX512BW-ONLY-NEXT: kshiftrq $32, %k0, %k1
; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm5 {%k1} {z}
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 64(%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, 64(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm5, 128(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 256(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 320(%rdx)
-; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 320(%rdx)
+; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-ONLY-NEXT: vzeroupper
; AVX512BW-ONLY-NEXT: retq
;
; AVX512VBMI-ONLY-LABEL: mask_replication_factor6_vf16:
; AVX512VBMI-ONLY: # %bb.0:
-; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k1
-; AVX512VBMI-ONLY-NEXT: vpmovm2b %k1, %zmm0
+; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k2
+; AVX512VBMI-ONLY-NEXT: vpmovm2b %k2, %zmm0
; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6,7,7,7,7,7,7,8,8,8,8,8,8,9,9,9,9,9,9,10,10,10,10]
; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k0
; AVX512VBMI-ONLY-NEXT: vpmovm2w %k0, %zmm0
-; AVX512VBMI-ONLY-NEXT: vpmovm2w %k1, %zmm1
+; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1
; AVX512VBMI-ONLY-NEXT: movl $1, %eax
; AVX512VBMI-ONLY-NEXT: kmovd %eax, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm0, %k1
; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z}
+; AVX512VBMI-ONLY-NEXT: vpmovm2w %k2, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovsxbw {{.*#+}} zmm2 = [10,10,11,11,11,11,11,11,12,12,12,12,12,12,13,13,13,13,13,13,14,14,14,14,14,14,15,15,15,15,15,15]
; AVX512VBMI-ONLY-NEXT: vpermw %zmm1, %zmm2, %zmm1
; AVX512VBMI-ONLY-NEXT: vpmovw2m %zmm1, %k2
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index f8cde4cf223a7..3d3e624479262 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -187,84 +187,44 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
-; AVX512F-LABEL: widen_fadd_v2f32_v16f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512F-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fadd_v2f32_v16f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm7, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
-; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: widen_fadd_v2f32_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm6, %xmm5, %xmm5
+; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm7, %xmm6, %xmm6
+; AVX512-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
+; AVX512-NEXT: vaddps %xmm7, %xmm8, %xmm7
+; AVX512-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512-NEXT: vmovupd %zmm0, (%rdx)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index fdf895921ca67..f787e79e6238a 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -162,27 +162,14 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vdivps %xmm4, %xmm2, %xmm2
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vdivps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm8, %xmm7, %xmm7
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,2,8,10,0,2,8,10]
+; AVX512F-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1
+; AVX512F-NEXT: vmovups (%rdi), %ymm0
+; AVX512F-NEXT: vdivps (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index 16baa068fc24f..e1b3dbc6e2a38 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -187,84 +187,44 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
-; AVX512F-LABEL: widen_fmul_v2f32_v16f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512F-NEXT: vmulps %xmm7, %xmm8, %xmm7
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fmul_v2f32_v16f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm7, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512VL-NEXT: vmulps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
-; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: widen_fmul_v2f32_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm6, %xmm5, %xmm5
+; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm7, %xmm6, %xmm6
+; AVX512-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
+; AVX512-NEXT: vmulps %xmm7, %xmm8, %xmm7
+; AVX512-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512-NEXT: vmovupd %zmm0, (%rdx)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 8dcd887ab4144..ac9c89a158cf6 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -187,84 +187,44 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
-; AVX512F-LABEL: widen_fsub_v2f32_v16f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm6, %xmm5, %xmm5
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512F-NEXT: vsubps %xmm8, %xmm7, %xmm7
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm4[4,5,6,7]
-; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: widen_fsub_v2f32_v16f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm6, %xmm5, %xmm5
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm7, %xmm6, %xmm6
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512VL-NEXT: vsubps %xmm8, %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
-; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: widen_fsub_v2f32_v16f32:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm6, %xmm5, %xmm5
+; AVX512-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm7, %xmm6, %xmm6
+; AVX512-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
+; AVX512-NEXT: vsubps %xmm8, %xmm7, %xmm7
+; AVX512-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
+; AVX512-NEXT: vmovupd %zmm0, (%rdx)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%a2 = getelementptr inbounds i8, ptr %a0, i64 8
%b2 = getelementptr inbounds i8, ptr %b0, i64 8
%c2 = getelementptr inbounds i8, ptr %c0, i64 8
>From 9315c33a4e5af0f564ae80164001fabda4142b87 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Mon, 1 Jun 2026 23:45:33 +0000
Subject: [PATCH 2/2] clang-format, use better API
---
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 58fc5ece9f3d3..6dec3ddf24bd4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -27583,9 +27583,8 @@ static SDValue foldExtractSubvectorFromConcatVectors(EVT NVT, SDValue V,
return SDValue();
assert(NewExtIdx % ExtNumElts == 0 &&
"Extract index is not a multiple of the input vector length.");
- SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
- V.getOperand(ConcatOpIdx), NewIndexC);
+ return DAG.getExtractSubvector(DL, NVT, V.getOperand(ConcatOpIdx),
+ NewExtIdx);
}
// If the extract covers multiple whole concat operands, rebuild that smaller
@@ -27717,9 +27716,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
}
}
- if (SDValue Folded =
- foldExtractSubvectorFromConcatVectors(NVT, V, ExtIdx, DL, DAG,
- LegalOperations))
+ if (SDValue Folded = foldExtractSubvectorFromConcatVectors(
+ NVT, V, ExtIdx, DL, DAG, LegalOperations))
return Folded;
if (SDValue Shuffle = foldExtractSubvectorFromShuffleVector(
More information about the llvm-branch-commits
mailing list