[llvm] [DAG] SelectionDAGBuilder::visitShuffleVector - split shuffle(concat(x,y),undef,mask) -> shuffle(x,y,mask) patterns (PR #180573)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Feb 9 09:48:29 PST 2026


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/180573

If a unary shuffle's source is twice the width of the result, split off the lower/upper subvectors and perform a binary shuffle instead of scalarizing.

I'm still being cautious here, as shuffle legalisation of illegal types can be tricky; I also didn't canonicalise any shuffle masks that reference the undef second source.

Fixes #88030
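
For reference, here is a minimal, hypothetical IR example of the kind of unary shuffle the description above targets (the function name, types and mask are illustrative and are not taken from the patch's test files). The <8 x i32> source is conceptually concat(x, y) of two <4 x i32> halves; assuming the new guard conditions hold (legal result type, no mask references to the undef second source), the builder can extract both halves and emit a binary shuffle with the original mask instead of scalarizing:

; Unary shuffle: only %src is referenced, and it is twice the width of the
; <4 x i32> result type.
define <4 x i32> @unary_wide_shuffle(<8 x i32> %src) {
  ; Conceptually shuffle(concat(lo, hi), poison, <0,2,4,6>); with this patch
  ; the DAG builder can form shuffle(lo, hi, <0,2,4,6>) from the two
  ; extracted <4 x i32> subvectors rather than going element-by-element.
  %r = shufflevector <8 x i32> %src, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  ret <4 x i32> %r
}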

>From 967ce35fd5b75fb3d6451549af183c7d82f900e9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 9 Feb 2026 17:46:25 +0000
Subject: [PATCH] [DAG] SelectionDAGBuilder::visitShuffleVector - split
 shuffle(concat(x,y),undef,mask) -> shuffle(x,y,mask) patterns

If a unary shuffle's source is twice the width of the result, split off the lower/upper subvectors and perform a binary shuffle instead of scalarizing.

I'm still being cautious here, as shuffle legalisation of illegal types can be tricky; I also didn't canonicalise any shuffle masks that reference the undef second source.

Fixes #88030
---
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  12 +
 ...ffer-fat-pointers-contents-legalization.ll |  44 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 288 +++++----
 ....amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll | 186 +++---
 ...m.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll | 239 ++++----
 .../AMDGPU/load-local-redundant-copies.ll     |  33 +-
 llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 554 +++---------------
 llvm/test/CodeGen/X86/vector-narrow-binop.ll  |   4 +-
 llvm/test/CodeGen/X86/vselect-avx.ll          |  10 +-
 9 files changed, 474 insertions(+), 896 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 7c762ed6d91ce..8223af0561482 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4277,6 +4277,18 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
 
   assert(SrcNumElts > MaskNumElts);
 
+  // See if we can recreate a binary shuffle by splitting a unary shuffle with
+  // a source twice the destination size.
+  if (SrcNumElts == (MaskNumElts * 2) && Src2.isUndef() &&
+      TLI.isTypeLegal(VT) &&
+      all_of(Mask, [MaskNumElts](int M) { return M < (int)MaskNumElts; })) {
+    SDValue LHS = DAG.getExtractSubvector(DL, VT, Src1, 0);
+    SDValue RHS = DAG.getExtractSubvector(DL, VT, Src1, MaskNumElts);
+    SDValue Result = DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
+    setValue(&I, Result);
+    return;
+  }
+
   // Analyze the access pattern of the vector to see if we can extract
   // two subvectors and do the shuffle.
   int StartIdx[2] = {-1, -1}; // StartIdx to extract from
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
index 5967d17c351ea..4d787637f0770 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll
@@ -2872,34 +2872,34 @@ define void @store_v32i8(<32 x i8> %data, ptr addrspace(8) inreg %buf) {
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-NEXT:    s_mov_b32 s4, 0xc0c0004
-; SDAG-NEXT:    v_perm_b32 v8, v8, v9, s4
-; SDAG-NEXT:    v_perm_b32 v9, v10, v11, s4
-; SDAG-NEXT:    buffer_load_ubyte v10, off, s[0:3], s32
+; SDAG-NEXT:    v_perm_b32 v20, v20, v21, s4
+; SDAG-NEXT:    v_perm_b32 v21, v22, v23, s4
+; SDAG-NEXT:    buffer_load_ubyte v22, off, s[0:3], s32
 ; SDAG-NEXT:    v_perm_b32 v12, v12, v13, s4
 ; SDAG-NEXT:    v_perm_b32 v13, v14, v15, s4
+; SDAG-NEXT:    v_perm_b32 v8, v8, v9, s4
+; SDAG-NEXT:    v_perm_b32 v9, v10, v11, s4
 ; SDAG-NEXT:    v_perm_b32 v4, v4, v5, s4
 ; SDAG-NEXT:    v_perm_b32 v5, v6, v7, s4
-; SDAG-NEXT:    v_perm_b32 v0, v0, v1, s4
-; SDAG-NEXT:    v_perm_b32 v6, v2, v3, s4
-; SDAG-NEXT:    v_lshl_or_b32 v3, v13, 16, v12
-; SDAG-NEXT:    v_lshl_or_b32 v2, v9, 16, v8
-; SDAG-NEXT:    v_lshl_or_b32 v1, v5, 16, v4
-; SDAG-NEXT:    v_lshl_or_b32 v0, v6, 16, v0
-; SDAG-NEXT:    v_perm_b32 v7, v28, v29, s4
-; SDAG-NEXT:    v_perm_b32 v11, v24, v25, s4
-; SDAG-NEXT:    v_perm_b32 v14, v26, v27, s4
-; SDAG-NEXT:    v_perm_b32 v15, v20, v21, s4
-; SDAG-NEXT:    v_perm_b32 v20, v22, v23, s4
+; SDAG-NEXT:    v_perm_b32 v10, v0, v1, s4
+; SDAG-NEXT:    v_perm_b32 v3, v2, v3, s4
+; SDAG-NEXT:    v_perm_b32 v24, v24, v25, s4
+; SDAG-NEXT:    v_perm_b32 v25, v26, v27, s4
+; SDAG-NEXT:    v_perm_b32 v23, v28, v29, s4
 ; SDAG-NEXT:    v_perm_b32 v16, v16, v17, s4
 ; SDAG-NEXT:    v_perm_b32 v17, v18, v19, s4
-; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
-; SDAG-NEXT:    v_lshl_or_b32 v5, v14, 16, v11
-; SDAG-NEXT:    v_lshl_or_b32 v4, v20, 16, v15
-; SDAG-NEXT:    v_lshl_or_b32 v3, v17, 16, v16
-; SDAG-NEXT:    s_waitcnt vmcnt(1)
-; SDAG-NEXT:    v_perm_b32 v0, v30, v10, s4
-; SDAG-NEXT:    v_lshl_or_b32 v6, v0, 16, v7
-; SDAG-NEXT:    buffer_store_dwordx4 v[3:6], off, s[16:19], 0 offset:16
+; SDAG-NEXT:    v_lshl_or_b32 v7, v13, 16, v12
+; SDAG-NEXT:    v_lshl_or_b32 v6, v9, 16, v8
+; SDAG-NEXT:    v_lshl_or_b32 v5, v5, 16, v4
+; SDAG-NEXT:    v_lshl_or_b32 v4, v3, 16, v10
+; SDAG-NEXT:    v_lshl_or_b32 v2, v25, 16, v24
+; SDAG-NEXT:    v_lshl_or_b32 v1, v21, 16, v20
+; SDAG-NEXT:    v_lshl_or_b32 v0, v17, 16, v16
+; SDAG-NEXT:    s_waitcnt vmcnt(0)
+; SDAG-NEXT:    v_perm_b32 v3, v30, v22, s4
+; SDAG-NEXT:    v_lshl_or_b32 v3, v3, 16, v23
+; SDAG-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0
+; SDAG-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
index a1fe463de1c54..4927b8110e5cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll
@@ -2859,15 +2859,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; SDAG-NEXT:    v_mov_b64_e32 v[32:33], 48
 ; SDAG-NEXT:    v_mov_b64_e32 v[34:35], 32
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s24
-; SDAG-NEXT:    v_mov_b32_e32 v37, s25
-; SDAG-NEXT:    v_mov_b32_e32 v38, s26
-; SDAG-NEXT:    v_mov_b32_e32 v39, s27
+; SDAG-NEXT:    v_mov_b32_e32 v40, s24
+; SDAG-NEXT:    v_mov_b32_e32 v41, s25
+; SDAG-NEXT:    v_mov_b32_e32 v42, s26
+; SDAG-NEXT:    v_mov_b32_e32 v43, s27
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT:    v_mov_b32_e32 v40, s28
-; SDAG-NEXT:    v_mov_b32_e32 v41, s29
-; SDAG-NEXT:    v_mov_b32_e32 v42, s30
-; SDAG-NEXT:    v_mov_b32_e32 v43, s31
+; SDAG-NEXT:    v_mov_b32_e32 v36, s28
+; SDAG-NEXT:    v_mov_b32_e32 v37, s29
+; SDAG-NEXT:    v_mov_b32_e32 v38, s30
+; SDAG-NEXT:    v_mov_b32_e32 v39, s31
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -2876,7 +2876,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[40:43], v[36:39], v[16:31]
 ; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -2978,15 +2978,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], 48
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], 32
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b32_e32 v4, s24
-; HEURRC-NEXT:    v_mov_b32_e32 v5, s25
-; HEURRC-NEXT:    v_mov_b32_e32 v6, s26
-; HEURRC-NEXT:    v_mov_b32_e32 v7, s27
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s24
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s25
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s26
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s27
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT:    v_mov_b32_e32 v8, s28
-; HEURRC-NEXT:    v_mov_b32_e32 v9, s29
-; HEURRC-NEXT:    v_mov_b32_e32 v10, s30
-; HEURRC-NEXT:    v_mov_b32_e32 v11, s31
+; HEURRC-NEXT:    v_mov_b32_e32 v4, s28
+; HEURRC-NEXT:    v_mov_b32_e32 v5, s29
+; HEURRC-NEXT:    v_mov_b32_e32 v6, s30
+; HEURRC-NEXT:    v_mov_b32_e32 v7, s31
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a30, s22
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a29, s21
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a28, s20
@@ -3003,7 +3003,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a17, s9
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a16, s8
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31]
+; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[8:11], v[4:7], a[16:31]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v8, s16
@@ -3049,15 +3049,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], 48
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], 32
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b32_e32 v36, s24
-; VGPRRC-NEXT:    v_mov_b32_e32 v37, s25
-; VGPRRC-NEXT:    v_mov_b32_e32 v38, s26
-; VGPRRC-NEXT:    v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT:    v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT:    v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT:    v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT:    v_mov_b32_e32 v43, s27
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT:    v_mov_b32_e32 v40, s28
-; VGPRRC-NEXT:    v_mov_b32_e32 v41, s29
-; VGPRRC-NEXT:    v_mov_b32_e32 v42, s30
-; VGPRRC-NEXT:    v_mov_b32_e32 v43, s31
+; VGPRRC-NEXT:    v_mov_b32_e32 v36, s28
+; VGPRRC-NEXT:    v_mov_b32_e32 v37, s29
+; VGPRRC-NEXT:    v_mov_b32_e32 v38, s30
+; VGPRRC-NEXT:    v_mov_b32_e32 v39, s31
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -3066,7 +3066,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31]
+; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[40:43], v[36:39], v[16:31]
 ; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -3260,15 +3260,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[32:33], 48
 ; SDAG-NEXT:    v_mov_b64_e32 v[34:35], 32
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v36, s24
-; SDAG-NEXT:    v_mov_b32_e32 v37, s25
-; SDAG-NEXT:    v_mov_b32_e32 v38, s26
-; SDAG-NEXT:    v_mov_b32_e32 v39, s27
+; SDAG-NEXT:    v_mov_b32_e32 v40, s24
+; SDAG-NEXT:    v_mov_b32_e32 v41, s25
+; SDAG-NEXT:    v_mov_b32_e32 v42, s26
+; SDAG-NEXT:    v_mov_b32_e32 v43, s27
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; SDAG-NEXT:    v_mov_b32_e32 v40, s28
-; SDAG-NEXT:    v_mov_b32_e32 v41, s29
-; SDAG-NEXT:    v_mov_b32_e32 v42, s30
-; SDAG-NEXT:    v_mov_b32_e32 v43, s31
+; SDAG-NEXT:    v_mov_b32_e32 v36, s28
+; SDAG-NEXT:    v_mov_b32_e32 v37, s29
+; SDAG-NEXT:    v_mov_b32_e32 v38, s30
+; SDAG-NEXT:    v_mov_b32_e32 v39, s31
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -3277,7 +3277,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[40:43], v[36:39], v[16:31] cbsz:2 abid:3 blgp:1
 ; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; SDAG-NEXT:    s_waitcnt vmcnt(0)
@@ -3379,15 +3379,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], 48
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], 32
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b32_e32 v4, s24
-; HEURRC-NEXT:    v_mov_b32_e32 v5, s25
-; HEURRC-NEXT:    v_mov_b32_e32 v6, s26
-; HEURRC-NEXT:    v_mov_b32_e32 v7, s27
+; HEURRC-NEXT:    v_mov_b32_e32 v8, s24
+; HEURRC-NEXT:    v_mov_b32_e32 v9, s25
+; HEURRC-NEXT:    v_mov_b32_e32 v10, s26
+; HEURRC-NEXT:    v_mov_b32_e32 v11, s27
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a31, s23
-; HEURRC-NEXT:    v_mov_b32_e32 v8, s28
-; HEURRC-NEXT:    v_mov_b32_e32 v9, s29
-; HEURRC-NEXT:    v_mov_b32_e32 v10, s30
-; HEURRC-NEXT:    v_mov_b32_e32 v11, s31
+; HEURRC-NEXT:    v_mov_b32_e32 v4, s28
+; HEURRC-NEXT:    v_mov_b32_e32 v5, s29
+; HEURRC-NEXT:    v_mov_b32_e32 v6, s30
+; HEURRC-NEXT:    v_mov_b32_e32 v7, s31
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a30, s22
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a29, s21
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a28, s20
@@ -3404,7 +3404,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a17, s9
 ; HEURRC-NEXT:    v_accvgpr_write_b32 a16, s8
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[4:7], v[8:11], a[16:31] cbsz:2 abid:3 blgp:1
+; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 a[0:15], v[8:11], v[4:7], a[16:31] cbsz:2 abid:3 blgp:1
 ; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], 16
 ; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], 0
 ; HEURRC-NEXT:    v_mov_b32_e32 v8, s16
@@ -3450,15 +3450,15 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[32:33], 48
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[34:35], 32
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b32_e32 v36, s24
-; VGPRRC-NEXT:    v_mov_b32_e32 v37, s25
-; VGPRRC-NEXT:    v_mov_b32_e32 v38, s26
-; VGPRRC-NEXT:    v_mov_b32_e32 v39, s27
+; VGPRRC-NEXT:    v_mov_b32_e32 v40, s24
+; VGPRRC-NEXT:    v_mov_b32_e32 v41, s25
+; VGPRRC-NEXT:    v_mov_b32_e32 v42, s26
+; VGPRRC-NEXT:    v_mov_b32_e32 v43, s27
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
-; VGPRRC-NEXT:    v_mov_b32_e32 v40, s28
-; VGPRRC-NEXT:    v_mov_b32_e32 v41, s29
-; VGPRRC-NEXT:    v_mov_b32_e32 v42, s30
-; VGPRRC-NEXT:    v_mov_b32_e32 v43, s31
+; VGPRRC-NEXT:    v_mov_b32_e32 v36, s28
+; VGPRRC-NEXT:    v_mov_b32_e32 v37, s29
+; VGPRRC-NEXT:    v_mov_b32_e32 v38, s30
+; VGPRRC-NEXT:    v_mov_b32_e32 v39, s31
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -3467,7 +3467,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[40:43], v[16:31] cbsz:2 abid:3 blgp:1
+; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[40:43], v[36:39], v[16:31] cbsz:2 abid:3 blgp:1
 ; VGPRRC-NEXT:    s_nop 11
 ; VGPRRC-NEXT:    global_store_dwordx4 v[32:33], v[12:15], off sc0 sc1
 ; VGPRRC-NEXT:    s_waitcnt vmcnt(0)
@@ -3962,21 +3962,20 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i
 define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
 ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; SDAG-NEXT:    v_mov_b32_e32 v40, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    v_mov_b32_e32 v36, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT:    v_mov_b32_e32 v32, s28
+; SDAG-NEXT:    v_mov_b32_e32 v33, s29
+; SDAG-NEXT:    v_mov_b32_e32 v34, s30
+; SDAG-NEXT:    v_mov_b32_e32 v35, s31
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -3985,7 +3984,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[32:35], v[16:31]
 ; SDAG-NEXT:    s_nop 6
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s21
@@ -4072,21 +4071,20 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ;
 ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
 ; HEURRC:       ; %bb.0:
-; HEURRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; HEURRC-NEXT:    v_mov_b32_e32 v40, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
-; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    v_mov_b32_e32 v36, s24
 ; HEURRC-NEXT:    v_mov_b32_e32 v37, s25
 ; HEURRC-NEXT:    v_mov_b32_e32 v38, s26
 ; HEURRC-NEXT:    v_mov_b32_e32 v39, s27
-; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; HEURRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s28
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s29
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s30
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s31
 ; HEURRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -4095,7 +4093,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; HEURRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[32:35], v[16:31]
 ; HEURRC-NEXT:    s_nop 6
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
@@ -4136,21 +4134,20 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ;
 ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd:
 ; VGPRRC:       ; %bb.0:
-; VGPRRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v40, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    v_mov_b32_e32 v36, s24
 ; VGPRRC-NEXT:    v_mov_b32_e32 v37, s25
 ; VGPRRC-NEXT:    v_mov_b32_e32 v38, s26
 ; VGPRRC-NEXT:    v_mov_b32_e32 v39, s27
-; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s28
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s29
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s30
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s31
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -4159,7 +4156,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31]
+; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[32:35], v[16:31]
 ; VGPRRC-NEXT:    s_nop 6
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
@@ -4339,21 +4336,20 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4
 define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
 ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; SDAG-NEXT:    v_mov_b32_e32 v40, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s20
-; SDAG-NEXT:    v_mov_b32_e32 v33, s21
-; SDAG-NEXT:    v_mov_b32_e32 v34, s22
-; SDAG-NEXT:    v_mov_b32_e32 v35, s23
-; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    v_mov_b32_e32 v36, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v37, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v38, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v39, s27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT:    v_mov_b32_e32 v32, s28
+; SDAG-NEXT:    v_mov_b32_e32 v33, s29
+; SDAG-NEXT:    v_mov_b32_e32 v34, s30
+; SDAG-NEXT:    v_mov_b32_e32 v35, s31
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -4362,7 +4358,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[32:35], v[16:31] cbsz:1 abid:2 blgp:3
 ; SDAG-NEXT:    s_nop 6
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s21
@@ -4449,21 +4445,20 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ;
 ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
 ; HEURRC:       ; %bb.0:
-; HEURRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; HEURRC-NEXT:    v_mov_b32_e32 v40, 0
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b32_e32 v32, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v33, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v34, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v35, s23
-; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    v_mov_b32_e32 v36, s24
 ; HEURRC-NEXT:    v_mov_b32_e32 v37, s25
 ; HEURRC-NEXT:    v_mov_b32_e32 v38, s26
 ; HEURRC-NEXT:    v_mov_b32_e32 v39, s27
-; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; HEURRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; HEURRC-NEXT:    v_mov_b32_e32 v32, s28
+; HEURRC-NEXT:    v_mov_b32_e32 v33, s29
+; HEURRC-NEXT:    v_mov_b32_e32 v34, s30
+; HEURRC-NEXT:    v_mov_b32_e32 v35, s31
 ; HEURRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -4472,7 +4467,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; HEURRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[32:35], v[16:31] cbsz:1 abid:2 blgp:3
 ; HEURRC-NEXT:    s_nop 6
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
 ; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
@@ -4513,21 +4508,20 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ;
 ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags:
 ; VGPRRC:       ; %bb.0:
-; VGPRRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; VGPRRC-NEXT:    v_mov_b32_e32 v40, 0
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b32_e32 v32, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v33, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v34, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v35, s23
-; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    v_mov_b32_e32 v36, s24
 ; VGPRRC-NEXT:    v_mov_b32_e32 v37, s25
 ; VGPRRC-NEXT:    v_mov_b32_e32 v38, s26
 ; VGPRRC-NEXT:    v_mov_b32_e32 v39, s27
-; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; VGPRRC-NEXT:    v_mov_b32_e32 v32, s28
+; VGPRRC-NEXT:    v_mov_b32_e32 v33, s29
+; VGPRRC-NEXT:    v_mov_b32_e32 v34, s30
+; VGPRRC-NEXT:    v_mov_b32_e32 v35, s31
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -4536,7 +4530,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3
+; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[36:39], v[32:35], v[16:31] cbsz:1 abid:2 blgp:3
 ; VGPRRC-NEXT:    s_nop 6
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
 ; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
@@ -4716,20 +4710,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a
 define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
 ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v22, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v23, s27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s30
+; SDAG-NEXT:    v_mov_b32_e32 v19, s31
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -4738,7 +4731,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[20:23], v[16:19], v[0:15]
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
@@ -4777,20 +4770,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ;
 ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
 ; HEURRC:       ; %bb.0:
-; HEURRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    v_mov_b32_e32 v20, s24
 ; HEURRC-NEXT:    v_mov_b32_e32 v21, s25
 ; HEURRC-NEXT:    v_mov_b32_e32 v22, s26
 ; HEURRC-NEXT:    v_mov_b32_e32 v23, s27
-; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s28
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s29
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s30
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s31
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -4799,7 +4791,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[20:23], v[16:19], v[0:15]
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, 0
 ; HEURRC-NEXT:    s_nop 10
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
@@ -4810,20 +4802,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ;
 ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac:
 ; VGPRRC:       ; %bb.0:
-; VGPRRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    v_mov_b32_e32 v20, s24
 ; VGPRRC-NEXT:    v_mov_b32_e32 v21, s25
 ; VGPRRC-NEXT:    v_mov_b32_e32 v22, s26
 ; VGPRRC-NEXT:    v_mov_b32_e32 v23, s27
-; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s28
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s29
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s30
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s31
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -4832,7 +4823,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15]
+; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[20:23], v[16:19], v[0:15]
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, 0
 ; VGPRRC-NEXT:    s_nop 10
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
@@ -4922,20 +4913,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0
 define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, ptr addrspace(1) %out) #0 {
 ; SDAG-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s20
-; SDAG-NEXT:    v_mov_b32_e32 v17, s21
-; SDAG-NEXT:    v_mov_b32_e32 v18, s22
-; SDAG-NEXT:    v_mov_b32_e32 v19, s23
-; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s24
 ; SDAG-NEXT:    v_mov_b32_e32 v21, s25
 ; SDAG-NEXT:    v_mov_b32_e32 v22, s26
 ; SDAG-NEXT:    v_mov_b32_e32 v23, s27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v16, s28
+; SDAG-NEXT:    v_mov_b32_e32 v17, s29
+; SDAG-NEXT:    v_mov_b32_e32 v18, s30
+; SDAG-NEXT:    v_mov_b32_e32 v19, s31
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -4944,7 +4934,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; SDAG-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[20:23], v[16:19], v[0:15] cbsz:3 abid:2 blgp:1
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 10
 ; SDAG-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
@@ -4983,20 +4973,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ;
 ; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
 ; HEURRC:       ; %bb.0:
-; HEURRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
-; HEURRC-NEXT:    v_mov_b32_e32 v16, s20
-; HEURRC-NEXT:    v_mov_b32_e32 v17, s21
-; HEURRC-NEXT:    v_mov_b32_e32 v18, s22
-; HEURRC-NEXT:    v_mov_b32_e32 v19, s23
-; HEURRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; HEURRC-NEXT:    v_mov_b32_e32 v20, s24
 ; HEURRC-NEXT:    v_mov_b32_e32 v21, s25
 ; HEURRC-NEXT:    v_mov_b32_e32 v22, s26
 ; HEURRC-NEXT:    v_mov_b32_e32 v23, s27
-; HEURRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; HEURRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; HEURRC-NEXT:    v_mov_b32_e32 v16, s28
+; HEURRC-NEXT:    v_mov_b32_e32 v17, s29
+; HEURRC-NEXT:    v_mov_b32_e32 v18, s30
+; HEURRC-NEXT:    v_mov_b32_e32 v19, s31
 ; HEURRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -5005,7 +4994,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; HEURRC-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; HEURRC-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; HEURRC-NEXT:    s_nop 1
-; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; HEURRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[20:23], v[16:19], v[0:15] cbsz:3 abid:2 blgp:1
 ; HEURRC-NEXT:    v_mov_b32_e32 v16, 0
 ; HEURRC-NEXT:    s_nop 10
 ; HEURRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
@@ -5016,20 +5005,19 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ;
 ; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags:
 ; VGPRRC:       ; %bb.0:
-; VGPRRC-NEXT:    s_load_dwordx8 s[20:27], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x24
+; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xa4
 ; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
-; VGPRRC-NEXT:    v_mov_b32_e32 v16, s20
-; VGPRRC-NEXT:    v_mov_b32_e32 v17, s21
-; VGPRRC-NEXT:    v_mov_b32_e32 v18, s22
-; VGPRRC-NEXT:    v_mov_b32_e32 v19, s23
-; VGPRRC-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
 ; VGPRRC-NEXT:    v_mov_b32_e32 v20, s24
 ; VGPRRC-NEXT:    v_mov_b32_e32 v21, s25
 ; VGPRRC-NEXT:    v_mov_b32_e32 v22, s26
 ; VGPRRC-NEXT:    v_mov_b32_e32 v23, s27
-; VGPRRC-NEXT:    s_waitcnt lgkmcnt(0)
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; VGPRRC-NEXT:    v_mov_b32_e32 v16, s28
+; VGPRRC-NEXT:    v_mov_b32_e32 v17, s29
+; VGPRRC-NEXT:    v_mov_b32_e32 v18, s30
+; VGPRRC-NEXT:    v_mov_b32_e32 v19, s31
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -5038,7 +5026,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32>
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; VGPRRC-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; VGPRRC-NEXT:    s_nop 1
-; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1
+; VGPRRC-NEXT:    v_mfma_i32_32x32x32_i8 v[0:15], v[20:23], v[16:19], v[0:15] cbsz:3 abid:2 blgp:1
 ; VGPRRC-NEXT:    v_mov_b32_e32 v16, 0
 ; VGPRRC-NEXT:    s_nop 10
 ; VGPRRC-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
index d9359c056e9d6..719838a01aa50 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll
@@ -1147,34 +1147,34 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd(<8 x i32
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x0
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s8
-; SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; SDAG-NEXT:    v_mov_b32_e32 v6, s14
-; SDAG-NEXT:    v_mov_b32_e32 v7, s15
-; SDAG-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v8, s16
-; SDAG-NEXT:    v_mov_b32_e32 v9, s17
-; SDAG-NEXT:    v_mov_b32_e32 v10, s18
-; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    v_mov_b32_e32 v12, s20
-; SDAG-NEXT:    v_mov_b32_e32 v13, s21
-; SDAG-NEXT:    v_mov_b32_e32 v14, s22
-; SDAG-NEXT:    v_mov_b32_e32 v15, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
+; SDAG-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x40
+; SDAG-NEXT:    v_mov_b32_e32 v8, s8
+; SDAG-NEXT:    v_mov_b32_e32 v9, s9
+; SDAG-NEXT:    v_mov_b32_e32 v10, s10
+; SDAG-NEXT:    v_mov_b32_e32 v11, s11
+; SDAG-NEXT:    v_mov_b32_e32 v12, s12
+; SDAG-NEXT:    v_mov_b32_e32 v13, s13
+; SDAG-NEXT:    v_mov_b32_e32 v14, s14
+; SDAG-NEXT:    v_mov_b32_e32 v15, s15
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    v_mov_b32_e32 v21, s12
-; SDAG-NEXT:    v_mov_b32_e32 v22, s13
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    v_mov_b32_e32 v21, s20
+; SDAG-NEXT:    v_mov_b32_e32 v22, s21
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[8:15], v[0:7], v[16:19], v21, v22 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; SDAG-NEXT:    s_nop 11
-; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[14:15]
+; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[22:23]
 ; SDAG-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd:
@@ -1215,26 +1215,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 0x41
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s8
-; SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; SDAG-NEXT:    v_mov_b32_e32 v6, s14
-; SDAG-NEXT:    v_mov_b32_e32 v7, s15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s8
+; SDAG-NEXT:    v_mov_b32_e32 v9, s9
+; SDAG-NEXT:    v_mov_b32_e32 v10, s10
+; SDAG-NEXT:    v_mov_b32_e32 v11, s11
+; SDAG-NEXT:    v_mov_b32_e32 v12, s12
+; SDAG-NEXT:    v_mov_b32_e32 v13, s13
+; SDAG-NEXT:    v_mov_b32_e32 v14, s14
+; SDAG-NEXT:    v_mov_b32_e32 v15, s15
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
-; SDAG-NEXT:    v_mov_b32_e32 v8, s16
-; SDAG-NEXT:    v_mov_b32_e32 v9, s17
-; SDAG-NEXT:    v_mov_b32_e32 v10, s18
-; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    v_mov_b32_e32 v12, s20
-; SDAG-NEXT:    v_mov_b32_e32 v13, s21
-; SDAG-NEXT:    v_mov_b32_e32 v14, s22
-; SDAG-NEXT:    v_mov_b32_e32 v15, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[8:15], v[0:7], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
@@ -1278,26 +1278,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 0x41
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s8
-; SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; SDAG-NEXT:    v_mov_b32_e32 v6, s14
-; SDAG-NEXT:    v_mov_b32_e32 v7, s15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s8
+; SDAG-NEXT:    v_mov_b32_e32 v9, s9
+; SDAG-NEXT:    v_mov_b32_e32 v10, s10
+; SDAG-NEXT:    v_mov_b32_e32 v11, s11
+; SDAG-NEXT:    v_mov_b32_e32 v12, s12
+; SDAG-NEXT:    v_mov_b32_e32 v13, s13
+; SDAG-NEXT:    v_mov_b32_e32 v14, s14
+; SDAG-NEXT:    v_mov_b32_e32 v15, s15
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
-; SDAG-NEXT:    v_mov_b32_e32 v8, s16
-; SDAG-NEXT:    v_mov_b32_e32 v9, s17
-; SDAG-NEXT:    v_mov_b32_e32 v10, s18
-; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    v_mov_b32_e32 v12, s20
-; SDAG-NEXT:    v_mov_b32_e32 v13, s21
-; SDAG-NEXT:    v_mov_b32_e32 v14, s22
-; SDAG-NEXT:    v_mov_b32_e32 v15, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[8:15], v[0:7], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
@@ -1341,26 +1341,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 1.0
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s8
-; SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; SDAG-NEXT:    v_mov_b32_e32 v6, s14
-; SDAG-NEXT:    v_mov_b32_e32 v7, s15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s8
+; SDAG-NEXT:    v_mov_b32_e32 v9, s9
+; SDAG-NEXT:    v_mov_b32_e32 v10, s10
+; SDAG-NEXT:    v_mov_b32_e32 v11, s11
+; SDAG-NEXT:    v_mov_b32_e32 v12, s12
+; SDAG-NEXT:    v_mov_b32_e32 v13, s13
+; SDAG-NEXT:    v_mov_b32_e32 v14, s14
+; SDAG-NEXT:    v_mov_b32_e32 v15, s15
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
-; SDAG-NEXT:    v_mov_b32_e32 v8, s16
-; SDAG-NEXT:    v_mov_b32_e32 v9, s17
-; SDAG-NEXT:    v_mov_b32_e32 v10, s18
-; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    v_mov_b32_e32 v12, s20
-; SDAG-NEXT:    v_mov_b32_e32 v13, s21
-; SDAG-NEXT:    v_mov_b32_e32 v14, s22
-; SDAG-NEXT:    v_mov_b32_e32 v15, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[8:15], v[0:7], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
@@ -1404,26 +1404,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_16x16x128_f8f6f4__vgprcd___scaleA
 ; SDAG-NEXT:    v_mov_b32_e32 v22, 1.0
 ; SDAG-NEXT:    v_mov_b32_e32 v20, 0
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v0, s8
-; SDAG-NEXT:    v_mov_b32_e32 v1, s9
-; SDAG-NEXT:    v_mov_b32_e32 v2, s10
-; SDAG-NEXT:    v_mov_b32_e32 v3, s11
-; SDAG-NEXT:    v_mov_b32_e32 v4, s12
-; SDAG-NEXT:    v_mov_b32_e32 v5, s13
-; SDAG-NEXT:    v_mov_b32_e32 v6, s14
-; SDAG-NEXT:    v_mov_b32_e32 v7, s15
+; SDAG-NEXT:    v_mov_b32_e32 v8, s8
+; SDAG-NEXT:    v_mov_b32_e32 v9, s9
+; SDAG-NEXT:    v_mov_b32_e32 v10, s10
+; SDAG-NEXT:    v_mov_b32_e32 v11, s11
+; SDAG-NEXT:    v_mov_b32_e32 v12, s12
+; SDAG-NEXT:    v_mov_b32_e32 v13, s13
+; SDAG-NEXT:    v_mov_b32_e32 v14, s14
+; SDAG-NEXT:    v_mov_b32_e32 v15, s15
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[2:3]
-; SDAG-NEXT:    v_mov_b32_e32 v8, s16
-; SDAG-NEXT:    v_mov_b32_e32 v9, s17
-; SDAG-NEXT:    v_mov_b32_e32 v10, s18
-; SDAG-NEXT:    v_mov_b32_e32 v11, s19
-; SDAG-NEXT:    v_mov_b32_e32 v12, s20
-; SDAG-NEXT:    v_mov_b32_e32 v13, s21
-; SDAG-NEXT:    v_mov_b32_e32 v14, s22
-; SDAG-NEXT:    v_mov_b32_e32 v15, s23
+; SDAG-NEXT:    v_mov_b32_e32 v0, s16
+; SDAG-NEXT:    v_mov_b32_e32 v1, s17
+; SDAG-NEXT:    v_mov_b32_e32 v2, s18
+; SDAG-NEXT:    v_mov_b32_e32 v3, s19
+; SDAG-NEXT:    v_mov_b32_e32 v4, s20
+; SDAG-NEXT:    v_mov_b32_e32 v5, s21
+; SDAG-NEXT:    v_mov_b32_e32 v6, s22
+; SDAG-NEXT:    v_mov_b32_e32 v7, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[0:1]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[0:7], v[8:15], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_16x16x128_f8f6f4 v[0:3], v[8:15], v[0:7], v[16:19], v22, v21 op_sel:[1,1,0] op_sel_hi:[1,0,0]
 ; SDAG-NEXT:    s_nop 11
 ; SDAG-NEXT:    global_store_dwordx4 v20, v[0:3], s[6:7]
 ; SDAG-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
index 978284e1fb1da..e48812a498e9d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll
@@ -3136,22 +3136,22 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
 ; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x80
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    v_mov_b32_e32 v20, s12
-; SDAG-NEXT:    v_mov_b32_e32 v21, s13
-; SDAG-NEXT:    v_mov_b32_e32 v22, s14
-; SDAG-NEXT:    v_mov_b32_e32 v23, s15
-; SDAG-NEXT:    v_mov_b32_e32 v24, s16
-; SDAG-NEXT:    v_mov_b32_e32 v25, s17
-; SDAG-NEXT:    v_mov_b32_e32 v26, s18
-; SDAG-NEXT:    v_mov_b32_e32 v27, s19
-; SDAG-NEXT:    v_mov_b32_e32 v28, s20
-; SDAG-NEXT:    v_mov_b32_e32 v29, s21
-; SDAG-NEXT:    v_mov_b32_e32 v30, s22
-; SDAG-NEXT:    v_mov_b32_e32 v31, s23
+; SDAG-NEXT:    v_mov_b32_e32 v24, s8
+; SDAG-NEXT:    v_mov_b32_e32 v25, s9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s10
+; SDAG-NEXT:    v_mov_b32_e32 v27, s11
+; SDAG-NEXT:    v_mov_b32_e32 v28, s12
+; SDAG-NEXT:    v_mov_b32_e32 v29, s13
+; SDAG-NEXT:    v_mov_b32_e32 v30, s14
+; SDAG-NEXT:    v_mov_b32_e32 v31, s15
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; SDAG-NEXT:    v_mov_b32_e32 v22, s22
+; SDAG-NEXT:    v_mov_b32_e32 v23, s23
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[38:39]
 ; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[40:41]
 ; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[42:43]
@@ -3162,7 +3162,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd(<8 x i32>
 ; SDAG-NEXT:    v_mov_b32_e32 v32, s0
 ; SDAG-NEXT:    v_mov_b32_e32 v33, s1
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[24:31], v[16:23], v[0:15], v32, v33 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 2
@@ -3215,37 +3215,38 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_kimm__scaleB__inlineimm:
 ; SDAG:       ; %bb.0:
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x0
-; SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x40
 ; SDAG-NEXT:    v_mov_b32_e32 v32, -2
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 0x41
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x80
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s8
-; SDAG-NEXT:    v_mov_b32_e32 v17, s9
-; SDAG-NEXT:    v_mov_b32_e32 v18, s10
-; SDAG-NEXT:    v_mov_b32_e32 v19, s11
-; SDAG-NEXT:    v_mov_b32_e32 v20, s12
-; SDAG-NEXT:    v_mov_b32_e32 v21, s13
-; SDAG-NEXT:    v_mov_b32_e32 v22, s14
-; SDAG-NEXT:    v_mov_b32_e32 v23, s15
-; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
-; SDAG-NEXT:    v_mov_b32_e32 v24, s16
-; SDAG-NEXT:    v_mov_b32_e32 v25, s17
-; SDAG-NEXT:    v_mov_b32_e32 v26, s18
-; SDAG-NEXT:    v_mov_b32_e32 v27, s19
-; SDAG-NEXT:    v_mov_b32_e32 v28, s20
-; SDAG-NEXT:    v_mov_b32_e32 v29, s21
-; SDAG-NEXT:    v_mov_b32_e32 v30, s22
-; SDAG-NEXT:    v_mov_b32_e32 v31, s23
-; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[38:39]
-; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[40:41]
-; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[42:43]
-; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[44:45]
-; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[46:47]
-; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[48:49]
-; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[50:51]
+; SDAG-NEXT:    v_mov_b32_e32 v16, s16
+; SDAG-NEXT:    v_mov_b32_e32 v17, s17
+; SDAG-NEXT:    v_mov_b32_e32 v18, s18
+; SDAG-NEXT:    v_mov_b32_e32 v19, s19
+; SDAG-NEXT:    v_mov_b32_e32 v20, s20
+; SDAG-NEXT:    v_mov_b32_e32 v21, s21
+; SDAG-NEXT:    v_mov_b32_e32 v22, s22
+; SDAG-NEXT:    v_mov_b32_e32 v23, s23
+; SDAG-NEXT:    s_load_dwordx16 s[16:31], s[4:5], 0x40
+; SDAG-NEXT:    v_mov_b32_e32 v24, s8
+; SDAG-NEXT:    v_mov_b32_e32 v25, s9
+; SDAG-NEXT:    v_mov_b32_e32 v26, s10
+; SDAG-NEXT:    v_mov_b32_e32 v27, s11
+; SDAG-NEXT:    v_mov_b32_e32 v28, s12
+; SDAG-NEXT:    v_mov_b32_e32 v29, s13
+; SDAG-NEXT:    v_mov_b32_e32 v30, s14
+; SDAG-NEXT:    v_mov_b32_e32 v31, s15
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[18:19]
+; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[20:21]
+; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[22:23]
+; SDAG-NEXT:    v_mov_b64_e32 v[8:9], s[24:25]
+; SDAG-NEXT:    v_mov_b64_e32 v[10:11], s[26:27]
+; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[28:29]
+; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[30:31]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[24:31], v[16:23], v[0:15], v33, v32 op_sel:[1,1,0] op_sel_hi:[1,0,0] blgp:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 0
 ; SDAG-NEXT:    s_nop 15
 ; SDAG-NEXT:    s_nop 2
@@ -3297,28 +3298,27 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4__vgprcd___scaleA_
 define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2, i32 %scale0, i32 %scale1) #1 {
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v18, s12
-; SDAG-NEXT:    v_mov_b32_e32 v19, s13
-; SDAG-NEXT:    v_mov_b32_e32 v20, s14
-; SDAG-NEXT:    v_mov_b32_e32 v21, s15
-; SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; SDAG-NEXT:    v_mov_b32_e32 v24, s18
-; SDAG-NEXT:    v_mov_b32_e32 v25, s19
-; SDAG-NEXT:    v_mov_b32_e32 v26, s20
-; SDAG-NEXT:    v_mov_b32_e32 v27, s21
-; SDAG-NEXT:    v_mov_b32_e32 v28, s22
-; SDAG-NEXT:    v_mov_b32_e32 v29, s23
+; SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x80
-; SDAG-NEXT:    v_mov_b32_e32 v30, s24
-; SDAG-NEXT:    v_mov_b32_e32 v31, s25
-; SDAG-NEXT:    v_mov_b32_e32 v32, s26
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
-; SDAG-NEXT:    v_mov_b32_e32 v33, s27
+; SDAG-NEXT:    v_mov_b32_e32 v26, s36
+; SDAG-NEXT:    v_mov_b32_e32 v27, s37
+; SDAG-NEXT:    v_mov_b32_e32 v28, s38
+; SDAG-NEXT:    v_mov_b32_e32 v29, s39
+; SDAG-NEXT:    v_mov_b32_e32 v30, s40
+; SDAG-NEXT:    v_mov_b32_e32 v31, s41
+; SDAG-NEXT:    v_mov_b32_e32 v32, s42
+; SDAG-NEXT:    v_mov_b32_e32 v33, s43
+; SDAG-NEXT:    v_mov_b32_e32 v18, s44
+; SDAG-NEXT:    v_mov_b32_e32 v19, s45
+; SDAG-NEXT:    v_mov_b32_e32 v20, s46
+; SDAG-NEXT:    v_mov_b32_e32 v21, s47
+; SDAG-NEXT:    v_mov_b32_e32 v22, s48
+; SDAG-NEXT:    v_mov_b32_e32 v23, s49
+; SDAG-NEXT:    v_mov_b32_e32 v24, s50
+; SDAG-NEXT:    v_mov_b32_e32 v25, s51
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -3329,7 +3329,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s0
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s1
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v16, v17 op_sel_hi:[0,0,0]
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v16, v17 op_sel_hi:[0,0,0]
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s22
@@ -3433,29 +3433,28 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__nonmac(<8 x
 define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #1 {
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
+; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; SDAG-NEXT:    v_mov_b32_e32 v16, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v17, 25
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v18, s12
-; SDAG-NEXT:    v_mov_b32_e32 v19, s13
-; SDAG-NEXT:    v_mov_b32_e32 v20, s14
-; SDAG-NEXT:    v_mov_b32_e32 v21, s15
-; SDAG-NEXT:    v_mov_b32_e32 v22, s16
-; SDAG-NEXT:    v_mov_b32_e32 v23, s17
-; SDAG-NEXT:    v_mov_b32_e32 v24, s18
-; SDAG-NEXT:    v_mov_b32_e32 v25, s19
-; SDAG-NEXT:    v_mov_b32_e32 v26, s20
-; SDAG-NEXT:    v_mov_b32_e32 v27, s21
-; SDAG-NEXT:    v_mov_b32_e32 v28, s22
-; SDAG-NEXT:    v_mov_b32_e32 v29, s23
-; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v30, s24
-; SDAG-NEXT:    v_mov_b32_e32 v31, s25
-; SDAG-NEXT:    v_mov_b32_e32 v32, s26
-; SDAG-NEXT:    v_mov_b32_e32 v33, s27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v26, s36
+; SDAG-NEXT:    v_mov_b32_e32 v27, s37
+; SDAG-NEXT:    v_mov_b32_e32 v28, s38
+; SDAG-NEXT:    v_mov_b32_e32 v29, s39
+; SDAG-NEXT:    v_mov_b32_e32 v30, s40
+; SDAG-NEXT:    v_mov_b32_e32 v31, s41
+; SDAG-NEXT:    v_mov_b32_e32 v32, s42
+; SDAG-NEXT:    v_mov_b32_e32 v33, s43
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v18, s44
+; SDAG-NEXT:    v_mov_b32_e32 v19, s45
+; SDAG-NEXT:    v_mov_b32_e32 v20, s46
+; SDAG-NEXT:    v_mov_b32_e32 v21, s47
+; SDAG-NEXT:    v_mov_b32_e32 v22, s48
+; SDAG-NEXT:    v_mov_b32_e32 v23, s49
+; SDAG-NEXT:    v_mov_b32_e32 v24, s50
+; SDAG-NEXT:    v_mov_b32_e32 v25, s51
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -3464,7 +3463,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
 ; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[18:25], v[26:33], v[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[26:33], v[18:25], v[0:15], v17, v16 op_sel_hi:[0,0,0] blgp:2
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v19, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v20, s22
@@ -3566,27 +3565,26 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__nonmac(<8
 define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonmac:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v32, s12
-; SDAG-NEXT:    v_mov_b32_e32 v33, s13
-; SDAG-NEXT:    v_mov_b32_e32 v34, s14
-; SDAG-NEXT:    v_mov_b32_e32 v35, s15
-; SDAG-NEXT:    v_mov_b32_e32 v36, s16
-; SDAG-NEXT:    v_mov_b32_e32 v37, s17
-; SDAG-NEXT:    v_mov_b32_e32 v38, s18
-; SDAG-NEXT:    v_mov_b32_e32 v39, s19
-; SDAG-NEXT:    v_mov_b32_e32 v40, s20
-; SDAG-NEXT:    v_mov_b32_e32 v41, s21
-; SDAG-NEXT:    v_mov_b32_e32 v42, s22
-; SDAG-NEXT:    v_mov_b32_e32 v43, s23
+; SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
 ; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v44, s24
-; SDAG-NEXT:    v_mov_b32_e32 v45, s25
-; SDAG-NEXT:    v_mov_b32_e32 v46, s26
-; SDAG-NEXT:    v_mov_b32_e32 v47, s27
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v40, s36
+; SDAG-NEXT:    v_mov_b32_e32 v41, s37
+; SDAG-NEXT:    v_mov_b32_e32 v42, s38
+; SDAG-NEXT:    v_mov_b32_e32 v43, s39
+; SDAG-NEXT:    v_mov_b32_e32 v44, s40
+; SDAG-NEXT:    v_mov_b32_e32 v45, s41
+; SDAG-NEXT:    v_mov_b32_e32 v46, s42
+; SDAG-NEXT:    v_mov_b32_e32 v47, s43
 ; SDAG-NEXT:    v_mov_b64_e32 v[30:31], s[22:23]
+; SDAG-NEXT:    v_mov_b32_e32 v32, s44
+; SDAG-NEXT:    v_mov_b32_e32 v33, s45
+; SDAG-NEXT:    v_mov_b32_e32 v34, s46
+; SDAG-NEXT:    v_mov_b32_e32 v35, s47
+; SDAG-NEXT:    v_mov_b32_e32 v36, s48
+; SDAG-NEXT:    v_mov_b32_e32 v37, s49
+; SDAG-NEXT:    v_mov_b32_e32 v38, s50
+; SDAG-NEXT:    v_mov_b32_e32 v39, s51
 ; SDAG-NEXT:    v_mov_b64_e32 v[28:29], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[26:27], s[18:19]
 ; SDAG-NEXT:    v_mov_b64_e32 v[24:25], s[16:17]
@@ -3595,7 +3593,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 ; SDAG-NEXT:    v_mov_b64_e32 v[18:19], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[16:17], s[8:9]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[32:39], v[40:47], v[16:31] blgp:2
+; SDAG-NEXT:    v_mfma_f32_32x32x64_f8f6f4 v[0:15], v[40:47], v[32:39], v[16:31] blgp:2
 ; SDAG-NEXT:    s_nop 14
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s21
@@ -3690,29 +3688,28 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
 define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac(<8 x i32> %arg0, <8 x i32> %arg1, <16 x float> %arg2) #0 {
 ; SDAG-LABEL: test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_nonmac:
 ; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_load_dwordx16 s[12:27], s[4:5], 0x0
+; SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0x0
+; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
 ; SDAG-NEXT:    v_mov_b32_e32 v32, 42
 ; SDAG-NEXT:    v_mov_b32_e32 v33, 25
 ; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; SDAG-NEXT:    v_mov_b32_e32 v16, s12
-; SDAG-NEXT:    v_mov_b32_e32 v17, s13
-; SDAG-NEXT:    v_mov_b32_e32 v18, s14
-; SDAG-NEXT:    v_mov_b32_e32 v19, s15
-; SDAG-NEXT:    v_mov_b32_e32 v20, s16
-; SDAG-NEXT:    v_mov_b32_e32 v21, s17
-; SDAG-NEXT:    v_mov_b32_e32 v22, s18
-; SDAG-NEXT:    v_mov_b32_e32 v23, s19
-; SDAG-NEXT:    v_mov_b32_e32 v24, s20
-; SDAG-NEXT:    v_mov_b32_e32 v25, s21
-; SDAG-NEXT:    v_mov_b32_e32 v26, s22
-; SDAG-NEXT:    v_mov_b32_e32 v27, s23
-; SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
-; SDAG-NEXT:    v_mov_b32_e32 v28, s24
-; SDAG-NEXT:    v_mov_b32_e32 v29, s25
-; SDAG-NEXT:    v_mov_b32_e32 v30, s26
-; SDAG-NEXT:    v_mov_b32_e32 v31, s27
-; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_mov_b32_e32 v24, s36
+; SDAG-NEXT:    v_mov_b32_e32 v25, s37
+; SDAG-NEXT:    v_mov_b32_e32 v26, s38
+; SDAG-NEXT:    v_mov_b32_e32 v27, s39
+; SDAG-NEXT:    v_mov_b32_e32 v28, s40
+; SDAG-NEXT:    v_mov_b32_e32 v29, s41
+; SDAG-NEXT:    v_mov_b32_e32 v30, s42
+; SDAG-NEXT:    v_mov_b32_e32 v31, s43
 ; SDAG-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; SDAG-NEXT:    v_mov_b32_e32 v16, s44
+; SDAG-NEXT:    v_mov_b32_e32 v17, s45
+; SDAG-NEXT:    v_mov_b32_e32 v18, s46
+; SDAG-NEXT:    v_mov_b32_e32 v19, s47
+; SDAG-NEXT:    v_mov_b32_e32 v20, s48
+; SDAG-NEXT:    v_mov_b32_e32 v21, s49
+; SDAG-NEXT:    v_mov_b32_e32 v22, s50
+; SDAG-NEXT:    v_mov_b32_e32 v23, s51
 ; SDAG-NEXT:    v_mov_b64_e32 v[2:3], s[10:11]
 ; SDAG-NEXT:    v_mov_b64_e32 v[4:5], s[12:13]
 ; SDAG-NEXT:    v_mov_b64_e32 v[6:7], s[14:15]
@@ -3721,7 +3718,7 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_25_42__vgprcd_non
 ; SDAG-NEXT:    v_mov_b64_e32 v[12:13], s[20:21]
 ; SDAG-NEXT:    v_mov_b64_e32 v[14:15], s[22:23]
 ; SDAG-NEXT:    s_nop 1
-; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[16:23], v[24:31], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
+; SDAG-NEXT:    v_mfma_scale_f32_32x32x64_f8f6f4 v[0:15], v[24:31], v[16:23], v[0:15], v33, v32 op_sel_hi:[0,0,0] blgp:2
 ; SDAG-NEXT:    v_mov_b32_e32 v16, s20
 ; SDAG-NEXT:    v_mov_b32_e32 v17, s21
 ; SDAG-NEXT:    v_mov_b32_e32 v18, s22
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
index c9615f478e5b5..0b3404e76c48c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll
@@ -30,26 +30,25 @@ define amdgpu_vs void @test(ptr addrspace(8) inreg %arg1, ptr addrspace(3) %arg2
 define amdgpu_vs void @test_2(ptr addrspace(8) inreg %arg1, i32 %arg2, i32 inreg %arg3, ptr addrspace(3) %arg4) {
 ; CHECK-LABEL: test_2:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 20, v1
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 16, v1
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 28, v1
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 24, v1
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, 12, v1
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, 8, v1
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 4, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, 12, v1
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 8, v1
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 4, v1
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 28, v1
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, 24, v1
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, 20, v1
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, 16, v1
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    ds_read_b32 v2, v2
-; CHECK-NEXT:    ds_read_b32 v5, v4
-; CHECK-NEXT:    ds_read_b32 v4, v6
-; CHECK-NEXT:    ds_read_b32 v9, v7
-; CHECK-NEXT:    ds_read_b32 v8, v8
-; CHECK-NEXT:    ds_read_b32 v7, v10
-; CHECK-NEXT:    ds_read_b32 v6, v1
+; CHECK-NEXT:    ds_read_b32 v4, v2
 ; CHECK-NEXT:    ds_read_b32 v3, v3
-; CHECK-NEXT:    s_waitcnt lgkmcnt(1)
-; CHECK-NEXT:    tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
+; CHECK-NEXT:    ds_read_b32 v2, v5
+; CHECK-NEXT:    ds_read_b32 v8, v6
+; CHECK-NEXT:    ds_read_b32 v7, v7
+; CHECK-NEXT:    ds_read_b32 v6, v9
+; CHECK-NEXT:    ds_read_b32 v5, v10
+; CHECK-NEXT:    ds_read_b32 v1, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
+; CHECK-NEXT:    tbuffer_store_format_xyzw v[1:4], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
+; CHECK-NEXT:    tbuffer_store_format_xyzw v[5:8], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
 ; CHECK-NEXT:    s_endpgm
   %load = load <8 x float>, ptr addrspace(3) %arg4, align 4
   %vec1 = shufflevector <8 x float> %load, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index a84466bc1ca1a..922ed2bd63fee 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -10,10 +10,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512BWVL-ONLY
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512VBMI,AVX512VBMI-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL,AVX512VBMI,AVX512VBMI-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
 
 ; PR31551
 ; Pairs of shufflevector:trunc functions with functional equivalence.
@@ -889,253 +889,54 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX1-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    pushq %rbx
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX1-NEXT:    vpextrw $4, %xmm2, %ecx
-; AVX1-NEXT:    vpextrw $2, %xmm2, %edx
-; AVX1-NEXT:    vmovd %xmm2, %esi
-; AVX1-NEXT:    vpextrw $6, %xmm1, %edi
-; AVX1-NEXT:    vpextrw $4, %xmm1, %r8d
-; AVX1-NEXT:    vpextrw $2, %xmm1, %r9d
-; AVX1-NEXT:    vmovd %xmm1, %r10d
+; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrw $6, %xmm1, %r11d
-; AVX1-NEXT:    vpextrw $4, %xmm1, %ebx
-; AVX1-NEXT:    vpextrw $2, %xmm1, %ebp
-; AVX1-NEXT:    vmovd %xmm1, %r14d
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX2-NEXT:    vpextrw $4, %xmm2, %ecx
-; AVX2-NEXT:    vpextrw $2, %xmm2, %edx
-; AVX2-NEXT:    vmovd %xmm2, %esi
-; AVX2-NEXT:    vpextrw $6, %xmm1, %edi
-; AVX2-NEXT:    vpextrw $4, %xmm1, %r8d
-; AVX2-NEXT:    vpextrw $2, %xmm1, %r9d
-; AVX2-NEXT:    vmovd %xmm1, %r10d
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrw $6, %xmm1, %r11d
-; AVX2-NEXT:    vpextrw $4, %xmm1, %ebx
-; AVX2-NEXT:    vpextrw $2, %xmm1, %ebp
-; AVX2-NEXT:    vmovd %xmm1, %r14d
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vpmovdb %zmm0, %xmm1
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX512F-NEXT:    vpextrw $4, %xmm2, %ecx
-; AVX512F-NEXT:    vpextrw $2, %xmm2, %edx
-; AVX512F-NEXT:    vmovd %xmm2, %esi
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT:    vpextrw $6, %xmm2, %edi
-; AVX512F-NEXT:    vpextrw $4, %xmm2, %r8d
-; AVX512F-NEXT:    vpextrw $2, %xmm2, %r9d
-; AVX512F-NEXT:    vmovd %xmm2, %r10d
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT:    vpextrw $6, %xmm0, %r11d
-; AVX512F-NEXT:    vpextrw $4, %xmm0, %ebx
-; AVX512F-NEXT:    vpextrw $2, %xmm0, %ebp
-; AVX512F-NEXT:    vpinsrb $5, %ebp, %xmm1, %xmm0
-; AVX512F-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %rbp
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX512VL-NEXT:    vpextrw $4, %xmm1, %ecx
-; AVX512VL-NEXT:    vpextrw $2, %xmm1, %edx
-; AVX512VL-NEXT:    vmovd %xmm1, %esi
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $6, %xmm1, %edi
-; AVX512VL-NEXT:    vpextrw $4, %xmm1, %r8d
-; AVX512VL-NEXT:    vpextrw $2, %xmm1, %r9d
-; AVX512VL-NEXT:    vmovd %xmm1, %r10d
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $6, %xmm1, %r11d
-; AVX512VL-NEXT:    vpextrw $4, %xmm1, %ebx
-; AVX512VL-NEXT:    vpextrw $2, %xmm1, %ebp
-; AVX512VL-NEXT:    vmovd %xmm1, %r14d
-; AVX512VL-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %rbp
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
+; AVX2-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX2-SLOW:       # %bb.0:
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX2-SLOW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX2-SLOW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-SLOW-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT:    vzeroupper
+; AVX2-SLOW-NEXT:    retq
 ;
-; AVX512BW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm1
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrw $6, %xmm2, %eax
-; AVX512BW-NEXT:    vpextrw $4, %xmm2, %ecx
-; AVX512BW-NEXT:    vpextrw $2, %xmm2, %edx
-; AVX512BW-NEXT:    vmovd %xmm2, %esi
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT:    vpextrw $6, %xmm2, %edi
-; AVX512BW-NEXT:    vpextrw $4, %xmm2, %r8d
-; AVX512BW-NEXT:    vpextrw $2, %xmm2, %r9d
-; AVX512BW-NEXT:    vmovd %xmm2, %r10d
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT:    vpextrw $6, %xmm0, %r11d
-; AVX512BW-NEXT:    vpextrw $4, %xmm0, %ebx
-; AVX512BW-NEXT:    vpextrw $2, %xmm0, %ebp
-; AVX512BW-NEXT:    vpinsrb $5, %ebp, %xmm1, %xmm0
-; AVX512BW-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
+; AVX2-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX2-FAST:       # %bb.0:
+; AVX2-FAST-NEXT:    vpmovsxdq {{.*#+}} ymm2 = [84148480,218892552,353636624,488380696]
+; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-FAST-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
+; AVX2-FAST-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-FAST-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT:    vzeroupper
+; AVX2-FAST-NEXT:    retq
 ;
-; AVX512BWVL-ONLY-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BWVL-ONLY:       # %bb.0:
-; AVX512BWVL-ONLY-NEXT:    pushq %rbp
-; AVX512BWVL-ONLY-NEXT:    pushq %r14
-; AVX512BWVL-ONLY-NEXT:    pushq %rbx
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $6, %xmm1, %eax
-; AVX512BWVL-ONLY-NEXT:    vpextrw $4, %xmm1, %ecx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $2, %xmm1, %edx
-; AVX512BWVL-ONLY-NEXT:    vmovd %xmm1, %esi
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $6, %xmm1, %edi
-; AVX512BWVL-ONLY-NEXT:    vpextrw $4, %xmm1, %r8d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $2, %xmm1, %r9d
-; AVX512BWVL-ONLY-NEXT:    vmovd %xmm1, %r10d
-; AVX512BWVL-ONLY-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $6, %xmm1, %r11d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $4, %xmm1, %ebx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $2, %xmm1, %ebp
-; AVX512BWVL-ONLY-NEXT:    vmovd %xmm1, %r14d
-; AVX512BWVL-ONLY-NEXT:    vpmovdb %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    popq %rbx
-; AVX512BWVL-ONLY-NEXT:    popq %r14
-; AVX512BWVL-ONLY-NEXT:    popq %rbp
-; AVX512BWVL-ONLY-NEXT:    vzeroupper
-; AVX512BWVL-ONLY-NEXT:    retq
-;
-; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT:    vpmovdb %ymm0, %xmm2
-; AVX512VBMI-FAST-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
-; AVX512VBMI-FAST-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-FAST-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX512VBMI-FAST-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512VBMI-FAST-NEXT:    vzeroupper
-; AVX512VBMI-FAST-NEXT:    retq
-;
-; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
-; AVX512VBMI-SLOW-NEXT:    vpmovdb %ymm0, %xmm2
-; AVX512VBMI-SLOW-NEXT:    vpermt2b %zmm0, %zmm1, %zmm2
-; AVX512VBMI-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpextrw $6, %xmm0, %eax
-; AVX512VBMI-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx
-; AVX512VBMI-SLOW-NEXT:    vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $13, %edx, %xmm2, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vzeroupper
-; AVX512VBMI-SLOW-NEXT:    retq
+; AVX512-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %n0 = shufflevector <32 x i16> %n2, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
   %n1 = trunc <16 x i16> %n0 to <16 x i8>
   ret <16 x i8> %n1
@@ -1230,260 +1031,41 @@ define <16 x i8> @oddelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX1-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    pushq %r14
-; AVX1-NEXT:    pushq %rbx
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpextrw $7, %xmm2, %eax
-; AVX1-NEXT:    vpextrw $5, %xmm2, %ecx
-; AVX1-NEXT:    vpextrw $3, %xmm2, %edx
-; AVX1-NEXT:    vpextrw $1, %xmm2, %esi
-; AVX1-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX1-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX1-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX1-NEXT:    vpextrw $1, %xmm1, %r10d
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX1-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX1-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX1-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX1-NEXT:    popq %rbx
-; AVX1-NEXT:    popq %r14
-; AVX1-NEXT:    popq %rbp
+; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT:    vpextrw $7, %xmm2, %eax
-; AVX2-NEXT:    vpextrw $5, %xmm2, %ecx
-; AVX2-NEXT:    vpextrw $3, %xmm2, %edx
-; AVX2-NEXT:    vpextrw $1, %xmm2, %esi
-; AVX2-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX2-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX2-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX2-NEXT:    vpextrw $1, %xmm1, %r10d
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [2,3,6,7,10,11,14,15,2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX2-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX2-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX2-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512F-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512F-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512F-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512F-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512F-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512F-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512F-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512F-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512F-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512F-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %rbp
-; AVX512F-NEXT:    vzeroupper
-; AVX512F-NEXT:    retq
-;
-; AVX512VL-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512VL-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512VL-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512VL-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512VL-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512VL-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512VL-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512VL-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512VL-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %rbp
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512BW-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    pushq %rbp
-; AVX512BW-NEXT:    pushq %r14
-; AVX512BW-NEXT:    pushq %rbx
-; AVX512BW-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512BW-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512BW-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512BW-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512BW-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512BW-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512BW-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512BW-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512BW-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512BW-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512BW-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT:    popq %rbx
-; AVX512BW-NEXT:    popq %r14
-; AVX512BW-NEXT:    popq %rbp
-; AVX512BW-NEXT:    vzeroupper
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-ONLY-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BWVL-ONLY:       # %bb.0:
-; AVX512BWVL-ONLY-NEXT:    pushq %rbp
-; AVX512BWVL-ONLY-NEXT:    pushq %r14
-; AVX512BWVL-ONLY-NEXT:    pushq %rbx
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $7, %xmm1, %eax
-; AVX512BWVL-ONLY-NEXT:    vpextrw $5, %xmm1, %ecx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $3, %xmm1, %edx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $1, %xmm1, %esi
-; AVX512BWVL-ONLY-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $7, %xmm1, %edi
-; AVX512BWVL-ONLY-NEXT:    vpextrw $5, %xmm1, %r8d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $3, %xmm1, %r9d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $1, %xmm1, %r10d
-; AVX512BWVL-ONLY-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-ONLY-NEXT:    vpextrw $7, %xmm1, %r11d
-; AVX512BWVL-ONLY-NEXT:    vpextrw $5, %xmm1, %ebx
-; AVX512BWVL-ONLY-NEXT:    vpextrw $3, %xmm1, %ebp
-; AVX512BWVL-ONLY-NEXT:    vpextrw $1, %xmm1, %r14d
-; AVX512BWVL-ONLY-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT:    popq %rbx
-; AVX512BWVL-ONLY-NEXT:    popq %r14
-; AVX512BWVL-ONLY-NEXT:    popq %rbp
-; AVX512BWVL-ONLY-NEXT:    vzeroupper
-; AVX512BWVL-ONLY-NEXT:    retq
-;
-; AVX512VBMI-FAST-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
-; AVX512VBMI-FAST-NEXT:    vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-FAST-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-FAST-NEXT:    vzeroupper
-; AVX512VBMI-FAST-NEXT:    retq
-;
-; AVX512VBMI-SLOW-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,u,u,u]
-; AVX512VBMI-SLOW-NEXT:    vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpextrw $7, %xmm0, %eax
-; AVX512VBMI-SLOW-NEXT:    vpextrw $5, %xmm0, %ecx
-; AVX512VBMI-SLOW-NEXT:    vpextrw $3, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT:    vzeroupper
-; AVX512VBMI-SLOW-NEXT:    retq
+; AVX512-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpsrld $16, %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %n0 = shufflevector <32 x i16> %n2, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
   %n1 = trunc <16 x i16> %n0 to <16 x i8>
   ret <16 x i8> %n1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512VBMI: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
index ad345213c1472..801f5929e35d8 100644
--- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll
+++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll
@@ -60,15 +60,15 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d
 define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
 ; SSE-LABEL: do_not_use_256bit_op:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm3, %xmm1
+; SSE-NEXT:    pand %xmm2, %xmm0
 ; SSE-NEXT:    psubd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: do_not_use_256bit_op:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpand %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
index ac330a7e60396..8c5db490cce73 100644
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -263,11 +263,11 @@ define <4 x i64> @vselect_concat_split_v16i8(<4 x i64> %a, <4 x i64> %b, <4 x i6
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpblendvb %xmm4, %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: vselect_concat_split_v16i8:


